def feature_is_idf_verb(article, headline):
	'''
	Returns the 1-based index of the first of the article's first three sentences
	that contains one of the headline's top-two verbs ranked by IDF, or 0 if none match.
	'''
	headline_pos = _pos_tag(headline.lower())
	verbs = [w for w, t in headline_pos if t[0] == 'V']
	verb_stems = [stemmer(v) for v in verbs]
	best_verbs = sorted(verb_stems, key=lambda stem: _stem_idf(stem), reverse=True)
	text = article["plaintext"]
	first_sentences = text.split('.')[:3]
	for verb in best_verbs[:2]:
		for i, sentence in enumerate(first_sentences):
			s_l = sentence.lower()
			stems = [stemmer(word) for word in s_l.split()]
			if verb in stems:
				return i + 1
	return 0
def feature_is_first_verb(article, headline):
	'''
	Returns the 1-based index of the first of the article's first three sentences
	in which the headline's first verb appears, or 0 if it does not appear.
	'''
	headline_pos = _pos_tag(headline.lower())
	verbs = [w for w, t in headline_pos if t[0] == 'V']
	verb_stems = [stemmer(v) for v in verbs]
	if len(verb_stems) == 0:
		return 0
	first_verb = verb_stems[0]
	text = article["plaintext"]
	first_sentences = text.split('.')[:3]
	for i, sentence in enumerate(first_sentences):
		s_l = sentence.lower()
		stems = [stemmer(word) for word in s_l.split()]
		if first_verb in stems:
			return i + 1
	return 0
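# The helpers _pos_tag and _stem_idf are not shown in these examples. Below is a
# minimal sketch of what they might look like, assuming NLTK's default tagger and
# the document-frequency counts produced by construct_inverse_document_matrix
# further down; the one-argument _stem_idf used above presumably closes over
# module-level counts.
import math
import nltk

def _pos_tag_sketch(text):
	# Tokenize, then tag; tags starting with 'V'/'N' mark verbs/nouns.
	return nltk.pos_tag(nltk.word_tokenize(text))

def _stem_idf_sketch(stem, inv_doc_counts, total_docs):
	# Standard IDF: log10(N / df); stems never seen in the corpus score 0.
	doc_count = inv_doc_counts.get(stem, 0)
	return math.log10(float(total_docs) / doc_count) if doc_count else 0.0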
def count(data):
	'''
	Count stem occurrences across all articles and record, for each stem, the
	most frequent surface form that produced it. Results are written as JSON.
	'''
	total_counts = {}
	stem_to_word = {}  # stem -> {surface word: count}
	for i, article in enumerate(data):
		update_progress(i, len(data))
		text = article.get('plaintext')
		for word in text.split():
			clean_word = re.sub(r'[^a-zA-Z ]','', word).lower()
			stem = stemmer(clean_word)
			cur_counts = stem_to_word.get(stem, {})
			# tally the surface form under its stem so the most common spelling can be picked later
			cur_counts[clean_word] = cur_counts.get(clean_word, 0) + 1
			stem_to_word[stem] = cur_counts

			total_counts[stem] = total_counts.get(stem,0) + 1

	representative_tokens = {}
	for stem, counts in stem_to_word.iteritems():
		representative = max(counts, key=lambda word: counts[word])
		representative_tokens[stem] = representative

	with open(OUT_COUNTS, 'w') as outf:
		json.dump(total_counts, outf)
	with open(REPRESENTATIVES, 'w') as outf:
		json.dump(representative_tokens, outf)
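# A hedged sketch of reading the two JSON files written above back in;
# OUT_COUNTS and REPRESENTATIVES are whatever paths this module defines.
import json

with open(OUT_COUNTS) as f:
	total_counts = json.load(f)           # stem -> total occurrences across the corpus
with open(REPRESENTATIVES) as f:
	representative_tokens = json.load(f)  # stem -> most frequent surface form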
def make_stem_counts(text):
	C = {}
	for word in text.split():
		clean_word = re.sub(r'[^a-zA-Z ]','', word).lower()
		stem = stemmer(clean_word)
		C[stem] = C.get(stem, 0) + 1
	return C
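# For illustration, a quick call to make_stem_counts, assuming stemmer is the
# porter2 stem function imported elsewhere in this module.
from stemming.porter2 import stem as stemmer

counts = make_stem_counts("The runner keeps running the run.")
# Non-alphabetic characters are stripped and words are stemmed before counting,
# so 'running' and 'run.' collapse to the same key.
print(counts)  # e.g. {'the': 2, 'runner': 1, 'keep': 1, 'run': 2}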
def create_ngram(file_list, source_directory, target_directory, stopwords, min_n, max_n, clean, stem):
    from collections import defaultdict
    from stemming.porter2 import stem as stemmer
    from nltk import ngrams
    punct = string.punctuation + string.digits
    os.chdir(source_directory)
    for file_name in file_list:
        long_gram = defaultdict(int)
        try:
            with open(file_name, 'rb') as f_in:
                text = f_in.read()
            if clean:
                text = ''.join(s for s in text if s not in punct).lower()
            text_list = re.split(r'[ \t\n\r]+', text)
            text_list = [word for word in text_list if word not in stopwords]
            if stem:
                text_list = [stemmer(word) for word in text_list]
            grams = [ngrams(text_list, gram_len) for gram_len in range(min_n, max_n + 1)]
            for gram in grams:
                for item in gram:
                    key = list(item)
                    key.append(str(len(key)))  # tag the n-gram with its length, e.g. 'market_crash_2'
                    long_gram['_'.join(key)] += 1
            with open(target_directory + '/' + file_name.split('.')[0] + '_grams.csv','wb') as f_out:
                csvw = csv.writer(f_out)
                for k,v in long_gram.iteritems():
                    csvw.writerow([k, v])
        except Exception as e:
            # surface the failure to the caller rather than silently swallowing it
            return e
    return True
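# A hypothetical call, just to show the argument order and the key format the
# function writes (word1_word2_2 for a bigram, with the n-gram length appended);
# the paths and stopword list below are placeholders.
create_ngram(
    file_list=['doc1.txt'],
    source_directory='/data/raw',
    target_directory='/data/grams',
    stopwords={'the', 'a', 'an', 'of', 'and'},
    min_n=1,
    max_n=3,
    clean=True,   # strip punctuation/digits and lowercase
    stem=True,    # apply the porter2 stemmer
)
# Would produce /data/grams/doc1_grams.csv with rows like: market_crash_2, 4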
Example #6
def tokenize_rest(text):
    # Tokenize the remaining text and map each word to a WordNet-recognized stem
    # (Lancaster stem, then a fallback stemmer, then the lemma) before lookup.
    wnl = WordNetLemmatizer()
    st = LancasterStemmer()
    words = nltk.word_tokenize(text)
    postag = nltk.pos_tag(words)
    
    tokens = []
    whfound=False
    for word in words:
        if word[0:2].lower() == 'wh' and not whfound:
            tokens.append({word.lower():'wh'})
            whfound = True
            continue
        elem=wnl.lemmatize(word)
        stem = st.stem(elem)
        synd = wn.synsets(stem)
        if not synd:
            stem = stemmer(elem)
            synd = wn.synsets(stem)
        if not synd:
            stem = elem
            synd = wn.synsets(stem)
        dbelement=detect(stem)
        if dbelement:
            for every_elem in dbelement:
                tokens.append({word:every_elem})
    print "\n Rest of possible Tokens"
    print tokens
    return tokens
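# The fallback chain above (Lancaster stem, then a second stemmer, then the lemma
# itself) exists because aggressive stemming often produces strings WordNet does
# not know. A small self-contained illustration of that chain, assuming the
# porter2 stemmer stands in for stemmer:
from nltk.corpus import wordnet as wn
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from stemming.porter2 import stem as stemmer

def wordnet_form(word):
    # Try progressively less aggressive normalizations until WordNet recognizes one.
    wnl, st = WordNetLemmatizer(), LancasterStemmer()
    elem = wnl.lemmatize(word)
    for candidate in (st.stem(elem), stemmer(elem), elem):
        if wn.synsets(candidate):
            return candidate
    return elem  # nothing matched; keep the lemma

print(wordnet_form('running'))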
def feature_is_idf_noun(article, headline):
	'''
	Returns the 1-based index of the first of the article's first three sentences
	that contains one of the headline's top-two nouns ranked by IDF (subject overlap).
	Returns 0 if there is no overlap.
	'''
	headline_pos = _pos_tag(headline.lower())
	nouns = [w for w, t in headline_pos if t[0] == 'N']
	noun_stems = [stemmer(n) for n in nouns]
	best_nouns = sorted(noun_stems, key=lambda stem: _stem_idf(stem), reverse=True)
	text = article["plaintext"]
	first_sentences = text.split('.')[:3]
	for noun in best_nouns[:2]:
		for i, sentence in enumerate(first_sentences):
			s_l = sentence.lower()
			stems = [stemmer(word) for word in s_l.split()]
			if noun in stems:
				return i+1
	return 0
def feature_is_first_noun(article, headline):
	'''
	Returns the 1-based index of the first of the article's first three sentences
	in which the headline's first noun appears, or 0 if it does not appear.
	'''
	headline_pos = _pos_tag(headline.lower())
	nouns = [w for w, t in headline_pos if t[0] == 'N']
	noun_stems = [stemmer(n) for n in nouns]
	text = article["plaintext"]
	first_sentences = text.split('.')[:3]
	if len(noun_stems) == 0:
		return 0
	first_stem = noun_stems[0]

	for i, sentence in enumerate(first_sentences):
		s_l = sentence.lower()
		stems = [stemmer(word) for word in s_l.split()]
		if first_stem in stems:
			return i + 1
	return 0
Example #9
def attributefunction():
    # For each word in attribute, pickle the lemma names of its WordNet synsets
    # to a file named 'atr_<word>'.
    wnl = WordNetLemmatizer()
    st = LancasterStemmer()
    for elem in attribute:
        final = []
        fileop = open('atr_' + elem, 'w')
        stem = st.stem(elem)
        synd = wn.synsets(stem)
        if not synd:
            stem = stemmer(elem)
            synd = wn.synsets(stem)
        if not synd:
            stem = wnl.lemmatize(elem)
            synd = wn.synsets(stem)
        for synset in synd:
            final.append(synset.lemma_names)
        pickle.dump(final,fileop)
        fileop.close()
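# For context, lemma_names gives the surface forms of a synset; it is a property
# in older NLTK releases (as used above) and a method in newer ones.
from nltk.corpus import wordnet as wn

syns = wn.synsets('car')
print(syns[0].lemma_names())  # e.g. ['car', 'auto', 'automobile', 'machine', 'motorcar']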
def construct_inverse_document_matrix(data):
	'''
	Count, for each stem, the number of documents that contain it and write the
	document-frequency map to INV_DOC_COUNTS as JSON.
	'''
	stem_to_docs = {}
	for i, article in enumerate(data):
		update_progress(i, len(data))
		text = article.get('plaintext')
		for word in text.split():
			clean_word = re.sub(r'[^a-zA-Z ]','', word).lower()
			stem = stemmer(clean_word)
			cur_set = stem_to_docs.get(stem, set())
			cur_set.add(i)
			stem_to_docs[stem] = cur_set

	stem_to_doccount = {}
	for stem, docs in stem_to_docs.iteritems():
		stem_to_doccount[stem] = len(docs)

	with open(INV_DOC_COUNTS, 'w') as outf:
		json.dump(stem_to_doccount, outf)
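# These counts feed the IDF term used further down: idf(stem) = log10(total_docs / doc_count).
# A quick worked example with made-up numbers:
import math

total_docs = 10000   # hypothetical corpus size
doc_count = 25       # hypothetical number of documents containing the stem
print(math.log10(float(total_docs) / doc_count))  # log10(400) ~= 2.60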
def assign_headline_tfidf_total(article):
	'''
	Score every candidate headline against the article by total TF-IDF overlap and
	return the article together with its top 60 candidates.
	'''
	article_text = article["plaintext"]
	stem_counts = make_stem_counts(article_text)
	ranked_headlines = []
	for h, i, p in all_headlines:
		# sum log-IDF * log-TF over the headline's stems that also occur in the article
		tfidf = 0
		for word in h.split():
			clean_word = re.sub(r'[^a-zA-Z ]','', word).lower()
			stem = stemmer(clean_word)
			stem_article_count = stem_counts.get(stem,0)
			if stem not in inv:
				continue
			if stem_article_count <= 0:
				continue
			stem_tfidf = math.log10(float(total_docs) / float(inv[stem])) * math.log10(stem_article_count)
			tfidf += stem_tfidf
		ranked_headlines += [(h, i, p, tfidf)]
	ranked_headlines.sort(key=lambda x: x[3], reverse=True)
	return (article, ranked_headlines[:60])
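# The per-stem score above is log10(total_docs / df) * log10(tf), so a stem must
# appear more than once in the article (tf > 1) to contribute, since log10(1) = 0.
# A quick check with made-up numbers:
import math

total_docs, df, tf = 10000, 25, 3   # hypothetical values
print(math.log10(float(total_docs) / df) * math.log10(tf))  # ~= 2.602 * 0.477 ~= 1.24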