Exemplo n.º 1
0
# Load the two name-keyed groups and the raw comment records.
# Each file is read inside a with-block so its handle is closed as soon as parsing ends.
with open('upper_half.json','rb') as fid:
	upper_half = json.load(fid)
with open('lower_half.json','rb') as fid:
	lower_half = json.load(fid)

with open('comments.csv','rb') as fid:
	comments = list(csv.DictReader(fid))

# Drop records whose student comment is one of the placeholder strings.
comments = [comment for comment in comments
		if comment['Student Comment'] not in ('None','NA')]

# Split the remaining comments by which group the record's 'Name' belongs to.
# Membership is tested on the mapping itself rather than on a freshly built key list.
upper_half_comments = [comment for comment in comments if comment['Name'] in upper_half]
lower_half_comments = [comment for comment in comments if comment['Name'] in lower_half]

ap('Upper len :%d'%len(upper_half_comments))
ap('Lower len :%d'%len(lower_half_comments))

upper_half_vocabulary  =' '.join(tech.cleanse(' '.join([comment['Student Comment'] for comment in upper_half_comments])))
lower_half_vocabulary = ' '.join(tech.cleanse(' '.join([comment['Student Comment'] for comment in lower_half_comments])))

upper_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(upper_half_vocabulary)) 
					if word not in punkt and word not in stopwords]
lower_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(lower_half_vocabulary)) 
					if word not in punkt and word not in stopwords]

upper_freqs = nltk.FreqDist(upper_half_words)
lower_freqs = nltk.FreqDist(lower_half_words)

print tech.weighted_jaccard_similarity(upper_freqs,lower_freqs)

fig,axs = plt.subplots(ncols=2)
for ax,data,label in zip(axs,[upper_freqs,lower_freqs],['Completers','Non-completers']):
	words,freqs = zip(*data.most_common(20))
Exemplo n.º 2
0
base_path = '/Volumes/My Book/twittwer-stream/control'

'''
data = []
for filename in os.listdir(base_path):
	with open(os.path.join(base_path,filename),'rb') as fid:
		data  += [json.load(fid)]

json.dump(data,open('/Volumes/My Book/twittwer-stream/amalgamated.json','wb'))
#--1 Classify
'''

# Load the amalgamated tweets; the with-block closes the handle once the JSON is parsed.
with open(os.path.join('/Volumes/My Book/twittwer-stream','amalgamated.json'),'rb') as fid:
	data = json.load(fid)
#data = json.load(open('control_tweets.json','rb'))

# Cleanse the 'text' field of every tweet.
text = tech.cleanse([tweet['text'] for tweet in data])

#Why duplicating one tweet from test corpus?
classifications = {}

def iqr(data):
	"""Return half of the interquartile range of `data`.

	Computes 0.5 * (75th percentile - 25th percentile) using np.percentile.
	If that computation raises, the offending input is printed and None is
	returned so the caller can see which input failed.
	"""
	try:
		return 0.5*(np.percentile(data,75) - np.percentile(data,25))
	except Exception:
		# Narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate.
		print(data)
		return None
def get(lst,field):
	"""Collect item[field] from every item of `lst`, in order, as a list."""
	values = []
	for entry in lst:
		values.append(entry[field])
	return values

for i,tweet in enumerate(text):
	if langid.classify(' '.join(tweet))[0] == 'en':
	  	tweet,usernames,hashtags =  tech.extract_tokens(tweet)
import numpy as np 

from awesome_print import ap 
from nltk.util import ngrams
from matplotlib import rcParams

rcParams['text.usetex'] = True

# Materialise the rows inside the with-block so the CSV handle is closed immediately.
with open('comments.csv','rb') as fid:
	data = list(csv.DictReader(fid))
categories = ['Reporter','Interpreter','Manager','Superior']

# Per category: cleansed comments, unigram and bigram frequency distributions, and two counts.
data_by_category = {}
for category in categories:
	# Student comments of the records whose physician comment equals this category (filtered once, reused below).
	category_comments = [student['Student Comment'] for student in data if student['Physician Comment']==category]
	comments = ' '.join(category_comments)

	data_by_category[category] = {}
	data_by_category[category]['comments'] = tech.cleanse(comments)
	data_by_category[category]['fdist'] = nltk.FreqDist(data_by_category[category]['comments'])
	tech.save_ngrams(data_by_category[category]['fdist'].most_common(50),filename='comments-%s'%category.lower())

	data_by_category[category]['bigram.fdist'] = nltk.FreqDist(ngrams(data_by_category[category]['comments'],2))
	# Distinct filename: the original reused 'comments-<category>' here, so the bigram table was
	# saved under the same name as the unigram table (presumably overwriting it).
	tech.save_ngrams(data_by_category[category]['bigram.fdist'].most_common(50),filename='comments-bigrams-%s'%category.lower())

	# NOTE(review): `comments` is the joined string, so this is a character count — confirm that is intended.
	data_by_category[category]['count.comments'] = len(comments)
	data_by_category[category]['count.students'] = len(category_comments)

# Pairwise Jaccard similarity between the cleansed comments of every pair of categories.
jmat = np.array([[tech.jaccard_similarity(data_by_category[one]['comments'],data_by_category[two]['comments'])
				for one in categories]
				for two in categories])

np.savetxt('calculated-jaccard-similarity.tsv',jmat,delimiter='\t',fmt='%.04f')
Exemplo n.º 4
0
import string, csv, itertools, nltk

import Graphics as artist 
import utils as tech

from nltk.util import ngrams
from awesome_print import ap 

# Read all rows inside the with-block so the CSV handle is closed right away.
with open('comments.csv','rb') as fid:
	data = list(csv.DictReader(fid))
# Join every student comment into one string and cleanse it.
# (The original wrapped the generator in itertools.chain, which added nothing for a single iterable.)
text = tech.cleanse(' '.join(record['Student Comment'] for record in data))

tech.savelines(text,filename='all-words-cleansed')

# The 100 most common cleansed tokens, saved as parallel (tokens, counts) sequences.
fdist = nltk.FreqDist(text)
tech.savelines(zip(*fdist.most_common(100)),filename='overall-frequencies-cleansed')

# Same for adjacent token pairs.
bigram_fdist = nltk.FreqDist(ngrams(text,2))
tech.savelines(zip(*bigram_fdist.most_common(100)),filename='bigram-frequencies-cleansed')

artist.frequency_plot(fdist,filename='overall-frequency-distribution')
Exemplo n.º 5
0
import os, nltk, csv

import utils as tech
import matplotlib.pyplot as plt 

CASE = os.path.join(os.getcwd(),'data','case')

# Read every row of the rated CSV into memory.
with open(os.path.join(CASE,'combined-deduplicated-rated.csv'),'r') as infile:
	items = list(csv.reader(infile))


# Column positions within each row.
TEXT = 0
RATING = 2
text = tech.cleanse([item[TEXT] for item in items])

# Flatten the cleansed entries into one token list.
tokens = []
for tweet in text:
	tokens.extend(tweet)

# Write each distinct token on its own line.
unique_tokens = set(tokens)
with open('rule-in-tokens.txt','wb') as outfile:
	for token in unique_tokens:
		print>>outfile,token

word_frequencies = nltk.FreqDist(tokens)

# Plot the counts of the 25 most common tokens, labelling the x axis with the tokens themselves.
top_words = word_frequencies.most_common(25)
words,freqs = zip(*top_words)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(freqs,'k--',linewidth=2)
tech.adjust_spines(ax)
ax.set_xticks(range(len(words)))
ax.set_xticklabels(words,rotation='vertical',weight='bold')
ax.set_ylabel('Count')
plt.tight_layout()
Exemplo n.º 6
0
# Union of the WordNet synsets returned for every token in IRRELEVANT_SPECIFIC.
IRRELEVANT_SPECIFIC_SYNSETS = set(itertools.chain.from_iterable(
	wn.synsets(token) for token in IRRELEVANT_SPECIFIC))

def assign_category(list_of_tokens,threshold=75):
	"""Return the `threshold`-th percentile of `list_of_tokens`.

	list_of_tokens: values accepted by np.percentile (must be numeric).
	threshold: percentile to compute, in the 0-100 range np.percentile expects.

	NOTE(review): the original called np.percentile() with no arguments, which
	always raised TypeError. Passing both parameters through is the minimal
	completion — confirm this is the intended categorisation rule.
	"""
	return np.percentile(list_of_tokens,threshold)

def score(list_of_tokens):
	"""Return the share of distinct tokens that are also in RELEVANT_SPECIFIC.

	Duplicates are collapsed before the ratio is taken. An empty input
	yields np.nan instead of dividing by zero.
	"""
	if len(list_of_tokens) <= 0:
		return np.nan
	distinct_tokens = set(list_of_tokens)
	matched = RELEVANT_SPECIFIC & distinct_tokens
	return len(matched)/float(len(distinct_tokens))
#Don't forget to do the controls

## Could look for words sufficiently semantically similar to the tokens, first see if direct similarity works

# Read the corpus file and close it. Only the text before the first '|' of each line is kept,
# and duplicates are removed before cleansing.
with open('test-high-prevalence.txt','r') as infile:
	corpus_lines = infile.read().splitlines()
test_corpus = tech.cleanse(set([item.split('|')[0] for item in corpus_lines]))
#random.shuffle(test_corpus)

#print len(test_corpus)
words = set(nltk.word_tokenize(' '.join(itertools.chain.from_iterable(test_corpus))))
omitted = []
senses = []
irrelevant = []
for word in words:
	print(word)
	# Query WordNet once per word; the original repeated the lookup up to five times.
	word_synsets = wn.synsets(word)
	if len(word_synsets) > 0:
		distinct_synsets = set(word_synsets)
		n_synsets = float(len(word_synsets))
		# Fraction of the word's synsets found in the relevant / irrelevant synset sets.
		senses.append(len(distinct_synsets & RELEVANT_SPECIFIC_SYNSETS)/n_synsets)
		irrelevant.append(len(distinct_synsets & IRRELEVANT_SPECIFIC_SYNSETS)/n_synsets)
	else:
		# Words for which WordNet returns no synsets.
		omitted.append(word)
Exemplo n.º 7
0
	if len(one & two) == 0:
		return 0 
	else:
		return len(one & two)/float(len(one | two))

# Column positions within each CSV row.
TEXT = 0
RATING = 2
with open(os.path.join(CASE,'combined-deduplicated-rated.csv'),'r') as infile:
	items = list(csv.reader(infile))

relevant, irrelevant  = [], []

# Rows rated 1 go to `relevant`; every other rating goes to `irrelevant`.
for item in items:
	if int(item[RATING]) == 1:
		relevant.append(item[TEXT])
	else:
		irrelevant.append(item[TEXT])

# Cleanse each group and flatten the result into a single token list.
relevant = list(itertools.chain.from_iterable(tech.cleanse(relevant)))
# NOTE(review): the literal token 'purple' is always appended — looks like a debugging leftover; confirm.
relevant.append('purple')
irrelevant =  list(itertools.chain.from_iterable(tech.cleanse(irrelevant)))

'''
Numerator 527
Denominator 3832
0.13752609603340293
'''


# Write one token per line to 'relevant-tokens' and 'irrelevant-tokens'.
token_groups = [('relevant',relevant),('irrelevant',irrelevant)]
for key,value in token_groups:
	with open('%s-tokens'%key,'w') as outfile:
		for token in value:
			print>>outfile,token
Exemplo n.º 8
0
def extract_grade(student):
	"""Build a {'name', 'grade'} record from a student row.

	'name' is the text before the first comma of student['name'], passed
	through str.capitalize; 'grade' is copied over unchanged.
	"""
	leading_part,_,_ = student['name'].partition(',')
	return {'name':leading_part.capitalize(),'grade':student['grade']}

# Read both CSV files inside with-blocks so their handles are closed promptly.
with open('data.csv','rb') as fid:
	ratings = [extract_grade(student) for student in csv.DictReader(fid)]

with open('comments.csv','rb') as fid:
	data = list(csv.DictReader(fid))
# Drop records whose physician comment is one of the excluded values.
data = [student for student in data
	if student['Physician Comment'] not in ('NA','Cedar','X')]

rating_names = [student['name'] for student in ratings]
data_names = list(set([student['Name'] for student in data]))
#cleans text for classifying
for student in data:
	text = tech.cleanse(student['Student Comment'])
	student['Student Comment'] = text

#split into testing and training sets
# A random half of the indices is held out for testing; the rest is used for training.
n = len(data)
test_idx = random.sample(xrange(n),int(n*0.5))
train_idx = set(xrange(n))-set(test_idx)

# Keep only (features, label) pairs whose label is truthy.
test_set = filter(lambda item: item[1] ,map(extract_featurelabel,[data[i] for i in test_idx]))
train_set = filter(lambda item: item[1] ,map(extract_featurelabel,[data[i] for i in train_idx]))

#classifier = NaiveBayesClassifier.train(train_set)
# Fit on the training half. The original passed test_set here, which trains on the held-out
# data and contradicts the commented-out line above.
classif.train(train_set)
#Compute accuracy
test_data,test_label = zip(*test_set)
train_data,train_label = zip(*train_set)
     	./data/source/keyword 
'''

#---LOAD DATA
# Build the corpus from the raw text files on first use and cache it as JSON; later runs reload the cache.
# Every file is handled in a with-block so handles are closed and the JSON dump is flushed to disk.
if not os.path.isfile(CORPUS_FILENAME):
	#More expressive than itertools.product, small loops --> no important speed or memory difference
	corpus = {}
	for source in sources:
		corpus[source] = {}
		for disease in keywords:
			path = os.path.join(base,source,disease)
			# Concatenate the contents of every non-directory entry under `path`.
			documents = []
			for filename in os.listdir(path):
				full_path = os.path.join(path,filename)
				if os.path.isdir(full_path):
					continue
				with open(full_path,READ) as fid:
					documents.append(fid.read())
			text = ' '.join(documents)
			text = text.replace('.',' ').replace("\n"," ")
			text = re.sub(r"[^\x00-\x7F]","",text) #Regexp faster than iterating through string to remove non-ASCII
			corpus[source][disease]  = list(tech.cleanse(text))
			#Cleanse returns type set. Type set is not JSON serializable. Type list is.
	with open(CORPUS_FILENAME,WRITE) as fid:
		json.dump(corpus,fid)

else:
	with open(CORPUS_FILENAME,READ) as fid:
		corpus = json.load(fid)


#--- CALCULATE JACCARD SIMILARITY

# One row per source, each row listing every source.
source_rubric = [[source for source in sources]
						 for source in sources]


# One output name per disease keyword, plus the combined JSON file.
filenames = ['jaccard-similarity-%s'%disease for disease in keywords]
filenames += ['jaccard-similarities.json']
Exemplo n.º 10
0
# WordNet synsets for every informative token, grouped by polarity.
synsets = {'positive' :{synset for token in informative_tokens['positive']['tokens'] for synset in wn.synsets(token)},
			'negative':{synset for token in informative_tokens['negative']['tokens'] for synset in wn.synsets(token)}}


# Each line of the rating file is split on '|'; the handle is closed once the file is read.
with open('evaluation-rating','r') as fid:
	data = [item.strip().split('|') for item in fid.read().splitlines()]

# Field 0 is used as the tweet text and field 2 as an integer rating; shorter lines are ignored.
tweets,my_ratings = zip(*[(item[0],int(item[2])) for item in data if len(item)>2])

positive, negative = [],[]

# Keep only tweets langid classifies as English; rating 1 goes to positive, anything else to negative.
for tweet,rating in zip(tweets,my_ratings):
	if langid.classify(tweet)[0] == 'en':
		if rating == 1:
			positive.append(tweet)
		else:
			negative.append(tweet)

positive,p_users = tech.extract_tokens([' '.join(tweet) for tweet in tech.cleanse(positive)])
negative,n_users = tech.extract_tokens([' '.join(tweet) for tweet in tech.cleanse(negative)])

# Extend the stored token / username collections in place.
# For list values this is the same in-place extension the original did through a temporary name.
informative_tokens['positive']['tokens'] += list(positive)
informative_tokens['negative']['tokens'] += list(negative)
informative_tokens['positive']['usernames'] += list(p_users)

tmp = informative_tokens['positive']['usernames']