def feature_is_idf_verb(article, headline):
    '''
    Returns the 1-based index (within the article's first three sentences) of
    the first sentence containing one of the two highest-IDF headline verbs,
    or 0 if there is no overlap.
    '''
    headline_pos = _pos_tag(headline.lower())
    verbs = [w for w, tag in headline_pos if tag[0] == 'V']
    verb_stems = [stemmer(v) for v in verbs]
    # Rank verb stems by IDF so the rarest (most informative) verbs come first.
    best_verbs = sorted(verb_stems, key=lambda stem: _stem_idf(stem), reverse=True)
    text = article["plaintext"]
    first_sentences = text.split('.')[:3]
    for verb in best_verbs[:2]:
        for i, sentence in enumerate(first_sentences):
            s_l = sentence.lower()
            stems = [stemmer(word) for word in s_l.split()]
            if verb in stems:
                return i + 1
    return 0

def feature_is_first_verb(article, headline):
    '''
    Returns the 1-based index (within the article's first three sentences) of
    the first sentence containing the first headline verb, or 0 if there is
    no overlap.
    '''
    headline_pos = _pos_tag(headline.lower())
    verbs = [w for w, tag in headline_pos if tag[0] == 'V']
    verb_stems = [stemmer(v) for v in verbs]
    if len(verb_stems) == 0:
        return 0
    first_verb = verb_stems[0]
    text = article["plaintext"]
    first_sentences = text.split('.')[:3]
    for i, sentence in enumerate(first_sentences):
        s_l = sentence.lower()
        stems = [stemmer(word) for word in s_l.split()]
        if first_verb in stems:
            return i + 1
    return 0

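# The feature functions in this section rely on a `_pos_tag` helper and a
# module-level `stemmer` that are not defined here. The sketch below shows
# what they are assumed to look like (`_pos_tag` wrapping NLTK's default
# tokenizer and tagger, `stemmer` being the porter2 stem function, as imported
# in create_ngram further down); the project's real helpers may differ, so
# treat this as illustrative only.
import nltk
from stemming.porter2 import stem as stemmer


def _pos_tag(text):
    # Returns a list of (token, POS-tag) pairs; tags beginning with 'N' or 'V'
    # mark nouns and verbs. Requires NLTK's tokenizer and tagger models.
    return nltk.pos_tag(nltk.word_tokenize(text))
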
def count(data):
    '''
    Counts stem frequencies across all articles and records, for each stem,
    the surface word that most often produced it.
    '''
    total_counts = {}
    stem_to_word = {}  # stem -> {word: count}
    for i, article in enumerate(data):
        update_progress(i, len(data))
        text = article.get('plaintext')
        for word in text.split():
            clean_word = re.sub(r'[^a-zA-Z ]', '', word).lower()
            stem = stemmer(clean_word)
            cur_counts = stem_to_word.get(stem, {})
            # Count occurrences of this particular surface form of the stem.
            cur_counts[clean_word] = cur_counts.get(clean_word, 0) + 1
            stem_to_word[stem] = cur_counts
            total_counts[stem] = total_counts.get(stem, 0) + 1
    # For each stem, pick its most frequent surface word as the representative.
    representative_tokens = {}
    for stem, counts in stem_to_word.iteritems():
        representative_tokens[stem] = max(counts, key=lambda x: counts[x])
    with open(OUT_COUNTS, 'w') as outf:
        json.dump(total_counts, outf)
    with open(REPRESENTATIVES, 'w') as outf:
        json.dump(representative_tokens, outf)

def make_stem_counts(text):
    C = {}
    for word in text.split():
        clean_word = re.sub(r'[^a-zA-Z ]', '', word).lower()
        stem = stemmer(clean_word)
        C[stem] = C.get(stem, 0) + 1
    return C

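# Usage sketch for make_stem_counts (illustrative only): cleaning strips the
# punctuation and stemming is expected to fold the inflections of "run" into
# a single stem count alongside the counts for the other stems.
def _example_make_stem_counts():
    return make_stem_counts("Running runs. The run was long.")
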
def create_ngram(file_list, source_directory, target_directory, stopwords, min_n, max_n, clean, stem):
    '''
    For each file, counts n-grams of length min_n..max_n (optionally cleaned,
    stopword-filtered, and stemmed) and writes the counts to a CSV in
    target_directory.
    '''
    from collections import defaultdict
    from stemming.porter2 import stem as stemmer
    from nltk import ngrams
    punct = string.punctuation + string.digits
    os.chdir(source_directory)
    for file_name in file_list:
        long_gram = defaultdict(int)
        try:
            with open(file_name, 'rb') as f_in:
                text = f_in.read()
            if clean:
                text = ''.join(s for s in text if s not in punct).lower()
            text_list = re.split(r'[ \t\n\r]+', text)
            text_list = [word for word in text_list if word not in stopwords]
            if stem:
                text_list = [stemmer(word) for word in text_list]
            grams = [ngrams(text_list, gram_len) for gram_len in range(min_n, max_n + 1)]
            for gram in grams:
                for item in gram:
                    key = list(item)
                    # Tag each key with the n-gram length, e.g. "new_york_2".
                    key.append(str(len(key)))
                    long_gram['_'.join(key)] += 1
            out_path = target_directory + '/' + file_name.split('.')[0] + '_grams.csv'
            with open(out_path, 'wb') as f_out:
                csvw = csv.writer(f_out)
                for k, v in long_gram.iteritems():
                    csvw.writerow([k, v])
        except Exception:
            # TODO: proper error handling; for now, signal failure to the caller.
            return False
    return True

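# Illustrative call to create_ngram; the file names, directories and stopword
# list below are placeholders rather than values taken from the project.
def _example_create_ngram():
    stopwords = set(['the', 'a', 'an', 'of'])
    # Writes /tmp/ngrams/doc1_grams.csv and /tmp/ngrams/doc2_grams.csv with
    # counts for all 1-, 2- and 3-grams.
    return create_ngram(
        file_list=['doc1.txt', 'doc2.txt'],
        source_directory='/tmp/corpus',
        target_directory='/tmp/ngrams',
        stopwords=stopwords,
        min_n=1,
        max_n=3,
        clean=True,
        stem=True,
    )
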
def tokenize_rest(text):
    '''
    Tokenizes `text` and maps each remaining word to database elements,
    trying the Lancaster stem, then the Porter stem, then the lemma itself
    until WordNet recognizes the form.
    '''
    wnl = WordNetLemmatizer()
    st = LancasterStemmer()
    words = nltk.word_tokenize(text)
    postag = nltk.pos_tag(words)
    tokens = []
    whfound = False
    for word in words:
        # Treat the first wh-word (who/what/where/...) as the question marker.
        if word[0:2].lower() == 'wh' and not whfound:
            tokens.append({word.lower(): 'wh'})
            whfound = True
            continue
        elem = wnl.lemmatize(word)
        stem = st.stem(elem)
        synd = wn.synsets(stem)
        if not synd:
            stem = stemmer(elem)
            synd = wn.synsets(stem)
            if not synd:
                stem = elem
                synd = wn.synsets(stem)
        dbelement = detect(stem)
        if dbelement:
            for every_elem in dbelement:
                tokens.append({word: every_elem})
    print "\n Rest of possible Tokens"
    print tokens
    return tokens

def feature_is_idf_noun(article, headline):
    '''
    Returns the 1-based index (within the article's first three sentences) of
    the first sentence containing one of the two highest-IDF headline nouns,
    or 0 if there is no overlap.
    '''
    headline_pos = _pos_tag(headline.lower())
    nouns = [w for w, tag in headline_pos if tag[0] == 'N']
    noun_stems = [stemmer(n) for n in nouns]
    # Rank noun stems by IDF so the rarest (most informative) nouns come first.
    best_nouns = sorted(noun_stems, key=lambda stem: _stem_idf(stem), reverse=True)
    text = article["plaintext"]
    first_sentences = text.split('.')[:3]
    for noun in best_nouns[:2]:
        for i, sentence in enumerate(first_sentences):
            s_l = sentence.lower()
            stems = [stemmer(word) for word in s_l.split()]
            if noun in stems:
                return i + 1
    return 0

def feature_is_first_noun(article, headline):
    '''
    Returns the 1-based index (within the article's first three sentences) of
    the first sentence containing the first headline noun, or 0 if there is
    no overlap.
    '''
    headline_pos = _pos_tag(headline.lower())
    nouns = [w for w, tag in headline_pos if tag[0] == 'N']
    noun_stems = [stemmer(n) for n in nouns]
    if len(noun_stems) == 0:
        return 0
    first_stem = noun_stems[0]
    text = article["plaintext"]
    first_sentences = text.split('.')[:3]
    for i, sentence in enumerate(first_sentences):
        s_l = sentence.lower()
        stems = [stemmer(word) for word in s_l.split()]
        if first_stem in stems:
            return i + 1
    return 0

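# Hypothetical illustration of how the four headline-overlap features above
# might be combined into one feature vector for an (article, headline) pair;
# the surrounding project may assemble its features differently.
def _example_headline_features(article, headline):
    return [
        feature_is_first_noun(article, headline),
        feature_is_idf_noun(article, headline),
        feature_is_first_verb(article, headline),
        feature_is_idf_verb(article, headline),
    ]
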
def attributefunction():
    wnl = WordNetLemmatizer()
    st = LancasterStemmer()
    for elem in attribute:
        final = []
        fileop = open('atr_' + elem, 'w')
        stem = st.stem(elem)
        synd = wn.synsets(stem)
        if not synd:
            stem = stemmer(elem)
            synd = wn.synsets(stem)
            if not synd:
                stem = wnl.lemmatize(elem)
                synd = wn.synsets(stem)
        for synset in synd:
            final.append(synset.lemma_names)
        pickle.dump(final, fileop)
        fileop.close()

def construct_inverse_document_matrix(data):
    '''
    Maps each stem to the number of articles it appears in and writes the
    counts to INV_DOC_COUNTS as JSON.
    '''
    stem_to_docs = {}
    for i, article in enumerate(data):
        update_progress(i, len(data))
        text = article.get('plaintext')
        for word in text.split():
            clean_word = re.sub(r'[^a-zA-Z ]', '', word).lower()
            stem = stemmer(clean_word)
            cur_set = stem_to_docs.get(stem, set())
            cur_set.add(i)
            stem_to_docs[stem] = cur_set
    stem_to_doccount = {}
    for stem, docs in stem_to_docs.iteritems():
        stem_to_doccount[stem] = len(docs)
    with open(INV_DOC_COUNTS, 'w') as outf:
        json.dump(stem_to_doccount, outf)

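# The IDF-based features above call a `_stem_idf` helper that is not shown in
# this section. A plausible sketch, assuming `inv` is the dict written by
# construct_inverse_document_matrix (loaded back with json.load) and
# `total_docs` is the corpus size; both of those names are assumptions here.
def _stem_idf(stem):
    # Stems never seen in the corpus are treated as appearing in one document,
    # which gives them the maximum IDF (a choice made for this sketch).
    doc_count = inv.get(stem, 1)
    return math.log10(float(total_docs) / float(doc_count))
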
def assign_headline_tfidf_total(article):
    '''
    Scores every candidate headline against the article by summing, over the
    headline's stems, log10(total_docs / doc_count) * log10(term_count), and
    returns the article together with its 60 highest-scoring headlines.
    '''
    article_text = article["plaintext"]
    stem_counts = make_stem_counts(article_text)
    ranked_headlines = []
    for h, i, p in all_headlines:
        tfidf = 0
        for word in h.split():
            clean_word = re.sub(r'[^a-zA-Z ]', '', word).lower()
            stem = stemmer(clean_word)
            stem_article_count = stem_counts.get(stem, 0)
            if stem not in inv:
                continue
            if stem_article_count <= 0:
                continue
            # IDF from the global inverse-document counts, TF from this article.
            stem_tfidf = math.log10(float(total_docs) / float(inv[stem])) * math.log10(stem_article_count)
            tfidf += stem_tfidf
        ranked_headlines.append((h, i, p, tfidf))
    ranked_headlines.sort(key=lambda x: x[3], reverse=True)
    return (article, ranked_headlines[:60])

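# Usage sketch: rank the candidate headlines for one article and keep the
# text of the highest-scoring one (element 0 of the (h, i, p, tfidf) tuple).
def _example_best_headline(article):
    article, ranked = assign_headline_tfidf_total(article)
    return ranked[0][0] if ranked else None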