import json
import re
import sys
import time
import urllib
import urllib2

# NOTE(review): the names Document and BeautifulSoup are used below but
# never imported anywhere in this file; presumably these were intended:
#   from readability.readability import Document
#   from bs4 import BeautifulSoup
# Left commented out here — confirm the actual packages before enabling.

# Location of the ground-truth records to process (a JSON array of
# dicts; each entry carries at least an 'id' URL and a 'uid').
input_json_path = "./input/gt_data.json"


# Output accumulator: processed article text keyed by each record's uid.
json_data = {}

# Load the ground-truth records once, up front.
with open(input_json_path) as data_file:
    input_data = json.load(data_file)

for data in input_data:
    # Per-record output: id (source URL), uid, and the cleaned text.
    op_json = {}

    # Fetch the raw page. Some servers reject urllib2's default
    # User-Agent, so a browser-like one is spoofed.
    request = urllib2.Request(data['id'],
                              headers={'User-Agent': "Magic Browser"})
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Fix: the original never closed the response, leaking a socket
        # per record.
        response.close()

    # readability's Document boils the page down to the main article HTML.
    readable_article = Document(html).summary(True)
    writable_tag_data = readable_article.encode('utf-8')

    # Save the processed HTML. `with` guarantees the file is closed even
    # if write() raises (the original bare open()/close() did not).
    with open('./data_op_readability/file_4_' + str(data['uid']) + ".html", 'w') as f:
        f.write(writable_tag_data)

    # Strip the remaining markup to plain text, then collapse all runs
    # of whitespace to single spaces. Raw string for the regex so the
    # escape sequence is unambiguous.
    doc = BeautifulSoup(writable_tag_data, 'lxml')
    full_text = doc.get_text().encode('utf-8')
    processed_data = re.sub(r'\s+', ' ', full_text).strip()

    op_json['id'], op_json['uid'], op_json['content'] = data['id'], data['uid'], processed_data
    json_data[data['uid']] = op_json

# Persist everything scraped above; the epoch-seconds timestamp in the
# filename keeps reruns from clobbering earlier output.
with open('./output_readability_txt/processed_data_readability_' + str(int(time.time())) + '.json', 'w') as outfile:
    json.dump(json_data, outfile)

# NOTE(review): the two lines that followed the dump were the orphaned
# tail of a list literal (a bare URL string plus a stray `]`) — a
# SyntaxError as written, almost certainly left over from a merged or
# pasted script. Reconstructed here as the `arr` list that the loop
# further down iterates; any entries that preceded this URL in the
# original list are lost — confirm against the source script.
arr = [
    "http://losangeles.backpage.com/FemaleEscorts/open-minded-sexy-brunette-janet-outcall/33471446"
]

# --- Debug probe: fetch a single hard-coded page, print readability's
# extract of it, and stop. The sys.exit() below makes everything after
# this point dead code — presumably left in while the extraction was
# being tuned; remove (or guard) it to let the batch loop below run.
html = urllib.urlopen("http://www.eroticmugshots.com/ftlauderdale-escorts/954-601-7752/?pid=36770728").read()
readable_article = Document(html).summary(True)
print readable_article
sys.exit()

# Counter used to number the output files. It is incremented at the top
# of each iteration, so the first file written is 'fileHtml22.html' —
# presumably files up to 21 came from an earlier run; TODO confirm.
i = 21
for urlex in arr:
	# Fetch each URL and reduce it to its main-article HTML.
	html = urllib.urlopen(urlex).read()
	readable_article = Document(html).summary(True)
	i += 1

	# Wrap the extracted fragment in a minimal HTML shell whose meta tag
	# declares UTF-8, so browsers render non-ASCII text correctly.
	tags = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>'
	data = readable_article.encode('utf-8')
	tags_end = "</body></html>"
	# Abandoned experiment: injecting the charset meta tag via
	# BeautifulSoup instead of string concatenation. Kept for reference.
	# soup = BeautifulSoup(data,"lxml")
	# metatag = soup.new_tag('head')
	# soup.html.insert(0,metatag)

	# metatag = soup.new_tag('meta')
	# metatag.attrs['http-equiv'] = 'Content-Type'
	# metatag.attrs['content'] = 'text/html; charset=utf-8'
	# soup.head.append(metatag)

	# print soup.prettify()
	# #print soup.contents
	# break

	# NOTE(review): the rest of this loop body (the write to f and its
	# close) is beyond the visible end of this chunk.
	f = open('fileHtml'+str(i)+".html",'w')