#from urllib.request import urlopen, Request import urllib2 input_json_path = "./input/gt_data.json" json_data = {} with open(input_json_path) as data_file: input_data = json.load(data_file) for data in input_data: op_json = {} html = urllib2.urlopen(urllib2.Request(data['id'], headers={'User-Agent' : "Magic Browser"}) ).read() readable_article = Document(html).summary(True) writable_tag_data = readable_article.encode('utf-8') #Save processed html files f = open('./data_op_readability/file_4_'+str(data['uid'])+".html",'w') f.write(writable_tag_data) f.close() #get pure content doc = BeautifulSoup(writable_tag_data,'lxml') full_text = doc.get_text().encode('utf-8') processed_data = re.sub( '\s+', ' ', full_text).strip() op_json['id'],op_json['uid'],op_json['content'] = data['id'],data['uid'],processed_data json_data[data['uid']] = op_json with open('./output_readability_txt/processed_data_readability_'+str(int(time.time()))+'.json', 'w') as outfile:
"http://losangeles.backpage.com/FemaleEscorts/sexy-caramel-barbie-doll-cute-girl/63871582", "http://losangeles.backpage.com/FemaleEscorts/realy-new-japanese-young-girl-pretty-sweet-cozy-massage-services-6572276076/65033335", "http://losangeles.backpage.com/FemaleEscorts/morning-specials-beautiful-and-latina-come-see-a-and-juicy-girl-who-loves-handsome-men/65025069", "http://losangeles.backpage.com/FemaleEscorts/luxuryspa-lovelylatinas-40510-freeway/62359660", "http://losangeles.backpage.com/FemaleEscorts/sexy-belizean-godess-big-booty-caramel-all-r-e-l-come-get-your-fixutit/62497186", "http://losangeles.backpage.com/FemaleEscorts/sexy-belizean-godess-big-booty-caramel-all-r-e-l-come-get-your-fixutit/62268601", "http://losangeles.backpage.com/FemaleEscorts/b-g-213331o692-8oqv-lax/64991663", "http://losangeles.backpage.com/FemaleEscorts/i-can-come-to-yuh/65054337", "http://losangeles.backpage.com/FemaleEscorts/lax-incall-1oo-specials-h0t-asian-latina-mix/63277038", "http://losangeles.backpage.com/FemaleEscorts/way-2-hot-2-handle-foxxy-brazilian-looking-2-have-fun-fun-fun-and-play-play-play/49623772", "http://losangeles.backpage.com/FemaleEscorts/sexy-beauty-andso-hot/33317924", "http://losangeles.backpage.com/FemaleEscorts/open-minded-sexy-brunette-janet-outcall/33471446" ] html = urllib.urlopen("http://www.eroticmugshots.com/ftlauderdale-escorts/954-601-7752/?pid=36770728").read() readable_article = Document(html).summary(True) print readable_article sys.exit() i = 21 for urlex in arr: html = urllib.urlopen(urlex).read() readable_article = Document(html).summary(True) i += 1 tags = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>' data = readable_article.encode('utf-8') tags_end = "</body></html>" # soup = BeautifulSoup(data,"lxml") # metatag = soup.new_tag('head') # soup.html.insert(0,metatag)