"""Fetch each URL listed in gt_data.json, extract its main readable content
with readability's Document, save the cleaned HTML per record, and write a
single timestamped JSON file mapping uid -> {id, uid, content} where
`content` is the tag-stripped, whitespace-collapsed text.

Python 2 script (urllib2).

NOTE(review): `Document` (readability-lxml) and `BeautifulSoup` (bs4) are
used but not imported here — presumably imported elsewhere in the project;
confirm, otherwise add:
    from readability import Document
    from bs4 import BeautifulSoup
"""
import json
import re
import time
import urllib2

input_json_path = "./input/gt_data.json"
json_data = {}

with open(input_json_path) as data_file:
    input_data = json.load(data_file)

for data in input_data:
    op_json = {}
    # data['id'] holds the URL; spoof the User-Agent because some sites
    # reject urllib2's default agent string.
    request = urllib2.Request(data['id'],
                              headers={'User-Agent': "Magic Browser"})
    html = urllib2.urlopen(request).read()

    # Readability extraction of the article body (summary(True) returns
    # the cleaned HTML markup).
    readable_article = Document(html).summary(True)
    writable_tag_data = readable_article.encode('utf-8')

    # Save processed html files.  `with` guarantees the handle is closed
    # even if the write raises (the original open/close could leak it).
    with open('./data_op_readability/file_4_' + str(data['uid']) + ".html", 'w') as f:
        f.write(writable_tag_data)

    # Get pure content: strip all tags, then collapse runs of whitespace
    # into single spaces.  Raw string for the regex escape.
    doc = BeautifulSoup(writable_tag_data, 'lxml')
    full_text = doc.get_text().encode('utf-8')
    processed_data = re.sub(r'\s+', ' ', full_text).strip()

    op_json['id'], op_json['uid'], op_json['content'] = \
        data['id'], data['uid'], processed_data
    json_data[data['uid']] = op_json

# One timestamped output file per run so earlier runs are never clobbered.
with open('./output_readability_txt/processed_data_readability_'
          + str(int(time.time())) + '.json', 'w') as outfile:
    json.dump(json_data, outfile)
"http://losangeles.backpage.com/FemaleEscorts/open-minded-sexy-brunette-janet-outcall/33471446" ] html = urllib.urlopen("http://www.eroticmugshots.com/ftlauderdale-escorts/954-601-7752/?pid=36770728").read() readable_article = Document(html).summary(True) print readable_article sys.exit() i = 21 for urlex in arr: html = urllib.urlopen(urlex).read() readable_article = Document(html).summary(True) i += 1 tags = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>' data = readable_article.encode('utf-8') tags_end = "</body></html>" # soup = BeautifulSoup(data,"lxml") # metatag = soup.new_tag('head') # soup.html.insert(0,metatag) # metatag = soup.new_tag('meta') # metatag.attrs['http-equiv'] = 'Content-Type' # metatag.attrs['content'] = 'text/html; charset=utf-8' # soup.head.append(metatag) # print soup.prettify() # #print soup.contents # break f = open('fileHtml'+str(i)+".html",'w')