def preprocess_job_market(in_path, out_path):
    # Relies on module-level imports of glob, textacy, en_core_web_sm and
    # keyterms (textacy.keyterms) that are not shown in these snippets.
    import os
    import json
    import codecs
    job_texts = []
    for filename in glob.glob(in_path + '*.json'):
        try:
            with codecs.open(filename, encoding='utf-8') as job_file:
                content = json.load(job_file)
                job_texts.append(content.get('description', u''))
        except Exception:
            print("===Exception reading file " + filename)
            continue
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    corpus_text = '\n'.join(text for text in job_texts)
    corpus.add_text(corpus_text)
    res_file = out_path + 'job_market.csv'
    if not os.path.isfile(res_file):
        termList1 = term_list(
            keyterms.textrank(corpus[0], normalize=u'lower', n_keyterms=30))
        termList2 = term_list(
            keyterms.sgrank(corpus[0], ngrams=(1, 2), normalize=u'lower',
                            window_width=100, n_keyterms=70, idf=None))
        termSet1 = set(termList1)
        termSet2 = set(termList2)
        diffSet = termSet1 - termSet2
        termList = termList2 + list(diffSet)
        save_terms_text(res_file, termList)
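# term_list and save_terms_text are called throughout these snippets but are
# not shown. A minimal sketch of what they might look like; the names come
# from the calls above, while the behavior sketched here (term_list drops the
# scores from (term, score) pairs, save_terms_text writes one term per line)
# is an assumption, not the original helpers:
def term_list(scored_terms):
    # keep only the term strings, discarding the scores
    return [term for term, _ in scored_terms]


def save_terms_text(path, terms):
    # write one term per line, UTF-8 encoded
    import codecs
    with codecs.open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(u'\n'.join(terms))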
def preprocess_kags(in_path, out_path):
    import os
    spacy_lang = en_core_web_sm.load()
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        _, kag_name = os.path.split(kag_path)
        corpus = textacy.corpus.Corpus(spacy_lang)
        texts = []
        for comp_path in glob.glob(kag_path + '/*'):
            for filename in glob.glob(comp_path + '/*.txt'):
                texts.append(open(filename, 'r').read().decode('utf-8'))
        corpus_text = '\n'.join(text for text in texts)
        corpus.add_text(corpus_text)
        # _, comp_file = os.path.split(comp_path)
        # sindex = len(kag_name) + 1
        # eindex = sindex + comp_file[sindex:].index('_')
        res_file = '{}.csv'.format(get_kag(kag_name))
        doc_idf = corpus.word_doc_freqs(lemmatize=None, weighting='idf',
                                        lowercase=True, as_strings=True)
        termList = keyterms.sgrank(corpus[0], ngrams=(1, 2, 3),
                                   normalize=u'lower', window_width=500,
                                   n_keyterms=30, idf=doc_idf)
        save_terms_csv(out_path + res_file, termList)
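# save_terms_csv is likewise called but not defined in these snippets. A
# plausible sketch, assuming it writes the (term, score) pairs returned by
# keyterms.sgrank as simple two-column CSV rows; the exact output format is an
# assumption:
def save_terms_csv(path, scored_terms):
    import codecs
    with codecs.open(path, 'w', encoding='utf-8') as out_file:
        for term, score in scored_terms:
            out_file.write(u'{0},{1}\n'.format(term, score))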
def preprocess_ms_jobs(in_path, out_path):
    from docx import Document
    import os
    from googletrans import Translator
    spacy_lang = en_core_web_sm.load()
    _, res_name = os.path.split(out_path)
    wordDoc = Document(in_path)
    job_count = 0
    for table in wordDoc.tables:
        desc = ''
        req = ''
        for row in table.rows:
            for cell in row.cells:
                if cell.text == "Functieomschrijving":  # Dutch: "job description"
                    desc = row.cells[1].text
                if cell.text == "Functie-eisen":  # Dutch: "job requirements"
                    req = row.cells[1].text
        if desc and req:
            corpus = textacy.corpus.Corpus(spacy_lang)
            corpus_text = '\n'.join(text for text in [desc, req])
            translator = Translator()
            corpus_text_en = translator.translate(corpus_text, dest='en').text
            corpus_text_en = corpus_text_en.encode('ascii', 'ignore')
            corpus_text_en = corpus_text_en.decode('utf-8')
            corpus.add_text(corpus_text_en)
            termList = keyterms.sgrank(corpus[0], ngrams=(1, 2, 3),
                                       normalize=u'lower', n_keyterms=100)
            res_file = out_path + 'job{0}.csv'.format(job_count)
            save_terms_csv(res_file, termList)
            job_count += 1
def test_ngrams_1(self, spacy_doc):
    expected = ["friedman", "international", "beirut", "bureau", "york"]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, ngrams=1, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_sgrank_ngrams_1(self):
    expected = ['friedman', 'international', 'beirut', 'bureau', 'york']
    observed = [
        term for term, _ in keyterms.sgrank(
            self.spacy_doc, ngrams=1, n_keyterms=5)
    ]
    self.assertEqual(len(expected), len(observed))
def test_sgrank_n_keyterms(spacy_doc):
    expected = [
        'new york times', 'new york times jerusalem bureau chief', 'friedman',
        'president george h. w. bush', 'david k. shipler'
    ]
    observed = [term for term, _ in keyterms.sgrank(spacy_doc, n_keyterms=5)]
    assert len(expected) == len(observed)
def keywords():
    # print request.get_json()
    arg = request.get_json()
    doc = textacy.Doc(arg['content'], metadata={'title': arg['title']},
                      lang=unicode('en_core_web_sm'))
    sgrank_keywords = dict(keyterms.sgrank(doc))
    singlerank_keywords = dict(keyterms.singlerank(doc))
    textrank_keywords = dict(keyterms.textrank(doc))
    sgrank_keywords.update((x, y * 0.9) for x, y in sgrank_keywords.items())
    textrank_keywords.update(
        (x, y * 0.05) for x, y in textrank_keywords.items())
    singlerank_keywords.update(
        (x, y * 0.05) for x, y in singlerank_keywords.items())
    keywords = res = dict(
        Counter(sgrank_keywords) + Counter(textrank_keywords) +
        Counter(singlerank_keywords))
    sorted_keywords = sorted(keywords.items(), key=operator.itemgetter(1),
                             reverse=True)
    keyword_string = ""
    for i, key in enumerate(sorted_keywords):
        if (i == int(len(sorted_keywords) / 2)):
            keyword_string = keyword_string + "||"
        if (i == len(sorted_keywords) - 1 or
                i == int(len(sorted_keywords) / 2) - 1):
            keyword_string = keyword_string + key[0]
        else:
            keyword_string = keyword_string + key[0] + ",,"
    return keyword_string
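# The endpoint above returns its keywords as one delimited string: the ranked
# terms are joined with ",," and the list is split into two halves by "||"
# (for example "a,,b||c,,d"). A minimal sketch of how a client might parse
# that string back into lists; parse_keyword_string is a hypothetical helper,
# not part of the original service:
def parse_keyword_string(keyword_string):
    # split into the two halves, then split each half on the ",," separator
    halves = keyword_string.split("||")
    return [half.split(",,") for half in halves if half]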
def key_terms(self, algorithm='sgrank', n=10):
    """
    Extract key terms from a document using `algorithm`.

    Args:
        algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional): name
            of algorithm to use for key term extraction
        n (int or float, optional): if int, number of top-ranked terms to
            return as keyterms; if float, must be in the open interval
            (0.0, 1.0), representing the fraction of top-ranked terms to
            return as keyterms

    Returns:
        list[(str, float)]: sorted list of top `n` key terms and their
        corresponding scores

    Raises:
        ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}

    .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
    .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
    .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
    """
    if algorithm == 'sgrank':
        return keyterms.sgrank(self.spacy_doc, window_width=1500, n_keyterms=n)
    elif algorithm == 'textrank':
        return keyterms.textrank(self.spacy_doc, n_keyterms=n)
    elif algorithm == 'singlerank':
        return keyterms.singlerank(self.spacy_doc, n_keyterms=n)
    else:
        raise ValueError('algorithm {} not a valid option'.format(algorithm))
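# For reference, a minimal self-contained sketch of the three textacy.keyterms
# calls dispatched above, outside any wrapper class. It assumes the older
# textacy releases these snippets target, where textacy.Doc and the
# keyterms.sgrank/textrank/singlerank functions are available, and that the
# en_core_web_sm spaCy model is installed:
import textacy
from textacy import keyterms

sample = textacy.Doc(u'Thomas Friedman was the New York Times Jerusalem '
                     u'bureau chief and won the Pulitzer Prize.',
                     lang=u'en_core_web_sm')
print(keyterms.sgrank(sample, n_keyterms=5))
print(keyterms.textrank(sample, n_keyterms=5))
print(keyterms.singlerank(sample, n_keyterms=5))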
def test_sgrank_norm_normalized_str(self):
    expected = [
        'New York Times', 'New York Times Jerusalem Bureau Chief', 'Friedman',
        'President George H. W. Bush', 'George Polk Award']
    observed = [
        term for term, _ in keyterms.sgrank(
            self.spacy_doc, normalize=spacy_utils.normalized_str,
            n_keyterms=5)]
    self.assertEqual(len(expected), len(observed))
def test_sgrank_norm_none(spacy_doc):
    expected = [
        'New York Times', 'New York Times Jerusalem Bureau Chief', 'Friedman',
        'President George H. W. Bush', 'George Polk Award']
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, normalize=None,
                                            n_keyterms=5)]
    assert len(expected) == len(observed)
def test_sgrank_window_width(spacy_doc):
    expected = [
        'new york times', 'friedman', 'new york times jerusalem',
        'times jerusalem bureau', 'second pulitzer prize']
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, window_width=50,
                                            n_keyterms=5)]
    assert len(expected) == len(observed)
def test_sgrank(spacy_doc):
    expected = [
        'new york times', 'york times jerusalem bureau chief', 'friedman',
        'president george h. w.', 'george polk award', 'pulitzer prize',
        'u.s. national book award', 'international reporting', 'beirut',
        'washington post']
    observed = [term for term, _ in keyterms.sgrank(spacy_doc)]
    assert len(expected) == len(observed)
def test_n_keyterms(self, spacy_doc):
    expected = [
        "new york times",
        "new york times jerusalem bureau chief",
        "friedman",
        "president george h. w. bush",
        "david k. shipler",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
    # can't do this owing to randomness of results
    # for e, o in zip(expected, observed):
    #     assert e == o
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, n_keyterms=0.1)
    ]
    assert len(observed) > 0
def test_sgrank_ngrams_1_2_3(spacy_doc):
    expected = [
        'new york times', 'friedman', 'pulitzer prize', 'beirut',
        'international reporting'
    ]
    observed = [
        term for term, _ in keyterms.sgrank(
            spacy_doc, ngrams=(1, 2, 3), n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_sgrank_n_keyterms(spacy_doc):
    expected = [
        "new york times",
        "new york times jerusalem bureau chief",
        "friedman",
        "president george h. w. bush",
        "david k. shipler",
    ]
    observed = [term for term, _ in keyterms.sgrank(spacy_doc, n_keyterms=5)]
    assert len(expected) == len(observed)
def test_sgrank_norm_lower(self):
    expected = [
        'new york times', 'president george h. w. bush', 'friedman',
        'new york times jerusalem bureau', 'george polk award']
    observed = [
        term for term, _ in keyterms.sgrank(self.spacy_doc, normalize='lower',
                                            n_keyterms=5)]
    self.assertEqual(len(expected), len(observed))
    for term in observed:
        self.assertEqual(term, term.lower())
def test_sgrank_norm_lower(spacy_doc):
    expected = [
        'new york times', 'president george h. w. bush', 'friedman',
        'new york times jerusalem bureau', 'george polk award']
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, normalize='lower',
                                            n_keyterms=5)]
    assert len(expected) == len(observed)
    for term in observed:
        assert term == term.lower()
def _apply_keyterm_ranking(self, doc, params=None):
    # tck appears to be an alias for textacy.keyterms in the original module
    if self.method == 'sgrank':
        keywords = textacy.keyterms.sgrank(doc, **params) \
            if params else tck.sgrank(doc)
    elif self.method == 'textrank':
        keywords = textacy.keyterms.textrank(doc, **params) \
            if params else tck.textrank(doc)
    elif self.method == 'singlerank':
        keywords = textacy.keyterms.singlerank(doc, **params) \
            if params else tck.singlerank(doc)
    return keywords
def extract_keyterms(data):
    tokens = []
    doc = Doc(data, lang="en_core_web_md")
    res = keyterms.sgrank(doc, n_keyterms=100)
    for r in res:
        tokens.append(str(r[0]))
    if len(tokens) == 0:
        tokens = ["empty"]
    return tokens
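# Example call for extract_keyterms above, assuming the en_core_web_md spaCy
# model is installed and that Doc and keyterms are imported at module level
# from the same older textacy release used elsewhere in these snippets:
print(extract_keyterms(u'Key term extraction with SGRank in textacy.'))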
def test_sgrank_norm_none(spacy_doc):
    expected = [
        "New York Times",
        "New York Times Jerusalem Bureau Chief",
        "Friedman",
        "President George H. W. Bush",
        "George Polk Award",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, normalize=None, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_sgrank_window_width(spacy_doc):
    expected = [
        "new york times",
        "friedman",
        "new york times jerusalem",
        "times jerusalem bureau",
        "second pulitzer prize",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, window_width=50, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_sgrank_ngrams_1_2_3(spacy_doc):
    expected = [
        "new york times",
        "friedman",
        "pulitzer prize",
        "beirut",
        "international reporting",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, ngrams=(1, 2, 3), n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_sgrank_norm_lower(spacy_doc):
    expected = [
        "new york times",
        "president george h. w. bush",
        "friedman",
        "new york times jerusalem bureau",
        "george polk award",
    ]
    observed = [
        term for term, _ in keyterms.sgrank(spacy_doc, normalize="lower", n_keyterms=5)
    ]
    assert len(expected) == len(observed)
    for term in observed:
        assert term == term.lower()
def preprocess_jobs_or_cvs(in_path, out_path):
    import os
    for filename in glob.glob(in_path + '*.json'):
        _, cv_file = os.path.split(filename)
        res_file = out_path + '{0}.csv'.format(cv_file[0:cv_file.index('.')])
        if not os.path.isfile(res_file):
            corpus = read_cv2(filename)
            termList = keyterms.sgrank(corpus[0], ngrams=(1, 2),
                                       normalize=u'lower', window_width=500,
                                       n_keyterms=30, idf=None)
            save_terms_csv(res_file, termList)
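# read_cv2 is not included in these snippets. A plausible sketch, modeled on
# preprocess_job_market above, assuming it loads a single JSON file and
# returns a one-document textacy corpus; the 'description' field name and the
# overall behavior are assumptions, not the original helper:
def read_cv2(filename):
    import json
    import codecs
    with codecs.open(filename, encoding='utf-8') as cv_file:
        content = json.load(cv_file)
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    corpus.add_text(content.get('description', u''))
    return corpus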
def preprocess_jobs_or_cvs(in_path, out_path):
    import os
    job_count = 0  # limit number of jobs processed
    for filename in glob.glob(in_path + '*.json'):
        _, joc_file = os.path.split(filename)
        res_file = out_path + '{0}.csv'.format(joc_file[0:joc_file.index('.')])
        if not os.path.isfile(res_file):
            corpus = read_cv2(filename)
            termList = keyterms.sgrank(corpus[0], normalize=u'lower',
                                       n_keyterms=100)
            save_terms_csv(res_file, termList)
            job_count += 1
            if job_count >= 100:
                break
def test_base(self, spacy_doc):
    expected = [
        "new york times",
        "york times jerusalem bureau chief",
        "friedman",
        "president george h. w.",
        "george polk award",
        "pulitzer prize",
        "u.s. national book award",
        "international reporting",
        "beirut",
        "washington post",
    ]
    observed = [term for term, _ in keyterms.sgrank(spacy_doc)]
    assert len(expected) == len(observed)
def get_keyphrases_sgrank(text, idfs):
    # make_spacy_doc, bioclean_mod, keyterms and max_idf are defined elsewhere
    # in the module this snippet was taken from
    doc = make_spacy_doc(bioclean_mod(text), lang='en')
    keyphrases = keyterms.sgrank(
        doc,
        ngrams=tuple(range(1, 4)),
        normalize=None,  # None, # u'lemma', # u'lower'
        window_width=50,
        n_keyterms=5,
        idf=None,
        include_pos=("NOUN", "PROPN", "ADJ"),  # ("NOUN", "PROPN", "ADJ"), # ("NOUN", "PROPN", "ADJ", "VERB", "CCONJ"),
    )
    # fall back to idf-ranked tokens when sgrank returns no keyphrases
    if len(keyphrases) == 0:
        # print([(tok, idfs[tok] if tok in idfs else max_idf) for tok in doc if tok.pos=='NOUN'])
        toks_with_idfs = [(tok, idfs[tok] if tok in idfs else max_idf)
                          for tok in doc]
        toks_with_idfs = sorted(toks_with_idfs, key=lambda x: x[1])
        keyphrases = [(tt[0].text, tt[1]) for tt in toks_with_idfs]
    # return text, keyphrases
    return keyphrases
def preprocess_category(in_path, out_path, category_name):
    import os
    spacy_lang = en_core_web_sm.load()
    print('===GIVEN CATEGORY: ' + category_name)
    for cat_path in glob.glob(in_path + '*'):
        _, cat_name = os.path.split(cat_path)
        print('===CATEGORY: ' + cat_name)
        if category_name == cat_name:
            print('###Fine, found category directory ...')
            # for comp_path in glob.glob(kag_path + '/*'):
            corpus = textacy.corpus.Corpus(spacy_lang)
            texts = []
            for filename in glob.glob(cat_path + '/*.txt'):
                texts.append(open(filename, 'r').read().decode('utf-8'))
            corpus_text = '\n'.join(text for text in texts)
            corpus.add_text(corpus_text)
            # _, comp_file = os.path.split(comp_path)
            # sindex = len(kag_name) + 1
            # eindex = sindex + comp_file[sindex:].index('_')
            # res_file = '{}.csv'.format(comp_file[sindex:eindex])
            res_file = '{}.csv'.format(category_name.lower())
            termList1 = term_list(
                keyterms.textrank(corpus[0], normalize=u'lower', n_keyterms=30))
            doc_idf = corpus.word_doc_freqs(lemmatize=None, weighting='idf',
                                            lowercase=True, as_strings=True)
            termList2 = term_list(
                keyterms.sgrank(corpus[0], ngrams=(1, 2, 3), normalize=u'lower',
                                window_width=500, n_keyterms=70, idf=doc_idf))
            termSet1 = set(termList1)
            termSet2 = set(termList2)
            diffSet = termSet1 - termSet2
            termList = termList2 + list(diffSet)
            save_terms_text(out_path + res_file, termList)
            break
def preprocess_competences_combined(in_path, out_path):
    import os
    spacy_lang = en_core_web_sm.load()
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        _, kag_name = os.path.split(kag_path)
        for comp_path in glob.glob(kag_path + '/*'):
            corpus = textacy.corpus.Corpus(spacy_lang)
            texts = []
            for filename in glob.glob(comp_path + '/*.txt'):
                # content = open(filename, 'r').read().decode('utf-8')
                # testing preprocess
                # clean_text = preprocess_text(content, no_punct=True, no_contractions=True, no_accents=True)
                # texts.append(clean_text)
                texts.append(open(filename, 'r').read().decode('utf-8'))
            corpus_text = '\n'.join(text for text in texts)
            corpus.add_text(corpus_text)
            _, comp_file = os.path.split(comp_path)
            sindex = len(kag_name) + 1
            eindex = sindex + comp_file[sindex:].index('_')
            res_file = '{}.csv'.format(comp_file[sindex:eindex])
            termList1 = term_list(
                keyterms.textrank(corpus[0], normalize=u'lower', n_keyterms=30))
            doc_idf = corpus.word_doc_freqs(lemmatize=None, weighting='idf',
                                            lowercase=True, as_strings=True)
            termList2 = term_list(
                keyterms.sgrank(corpus[0], ngrams=(1, 2, 3), normalize=u'lower',
                                window_width=500, n_keyterms=70, idf=doc_idf))
            termSet1 = set(termList1)
            termSet2 = set(termList2)
            diffSet = termSet1 - termSet2
            termList = termList2 + list(diffSet)
            save_terms_text(out_path + res_file, termList)
def preprocess_competences2(in_path, out_path):
    import os
    spacy_lang = en_core_web_sm.load()
    for kag_path in glob.glob(in_path + '/*'):
        _, kag_name = os.path.split(kag_path)
        print('===KAG: ' + kag_name)
        for filename in glob.glob(kag_path + '/*.txt'):
            _, comp_file = os.path.split(filename)
            print('===competence file: ' + comp_file)
            if comp_file.index('.') >= 5:
                print('===preprocessing competence file: ' + comp_file)
                corpus = textacy.corpus.Corpus(spacy_lang)
                corpus.add_text(open(filename, 'r').read().decode('utf-8'))
                # doc_idf = corpus.word_doc_freqs(lemmatize=None, weighting='idf', lowercase=True, as_strings=True)
                termList = term_list(
                    keyterms.sgrank(corpus[0], ngrams=(1, 2, 3),
                                    normalize=u'lower', idf=None))
                res_file = '{}.csv'.format(comp_file[:-4])
                print('===Writing to: ' + res_file)
                save_terms_text(out_path + res_file, termList)