def write_header_comment_file(self):
    # Write the CSV header to the comment file only once.
    if self.header_written:
        return
    comment_file = 'comment_' + self.outfile
    wrtr_file = open(comment_file, 'ab')
    wrtr = UnicodeWriter(wrtr_file)
    wrtr.writerow([u'Language', u'Url', u'Published', u'Country', u'ThreadId',
                   u'Inserted', u'PostSize', u'Subject', u'Text'])
    wrtr_file.close()
    self.header_written = True
def __init__(self, key, outfile, filter_thread):
    self.url = 'http://api.boardreader.com/v1/Blogs/Thread'
    self.key = key
    self.outfile = outfile
    self.filter_thread = filter_thread
    self.response = requests.get(self.url, params={'key': self.key, 'rt': 'json',
                                                   'filter_thread': self.filter_thread})
    wrtr_file = open(self.outfile, 'ab')
    wrtr = UnicodeWriter(wrtr_file)
    for item in self.response.json()['response']['Matches']['Match']:
        # Strip commas and control characters so the text stays in a single CSV field.
        text = re.sub(r'[,\n\t\b-]', '', item['Text'])
        text = text.replace(',', '')
        wrtr.writerow([item[u'Language'], item[u'Url'], item[u'Published'],
                       item['Country'], item[u'ThreadId'], item[u'Inserted'],
                       str(item['PostSize']), item['Subject'].replace(',', ''), text])
    wrtr_file.close()
def save_response(self):
    # Keep paging while the query budget lasts; get_response() is assumed to
    # decrement self.query_limit, otherwise this loop would not terminate.
    while self.query_limit > 0:
        self.filter_inserted_to = int(time.mktime(time.strptime(
            self.response['Matches']['Match'][-1]['Published'], "%Y-%m-%d %H:%M:%S")))
        self.response['Matches']['Match'].extend(self.get_response()['Matches']['Match'])
    # Write all accumulated matches to the thread CSV.
    wrtr_file = open(self.outfile, 'wb')
    wrtr = UnicodeWriter(wrtr_file)
    wrtr.writerow([u'Language', u'Url', u'Published', u'Country', u'ThreadId',
                   u'Inserted', u'PostSize', u'Subject', u'Text'])
    for item in self.response['Matches']['Match']:
        text = re.sub(r'[,\n\t\b-]', '', item['Text'])
        text = text.replace(',', '')
        wrtr.writerow([item[u'Language'], item[u'Url'], item[u'Published'],
                       item['Country'], item[u'ThreadId'], item[u'Inserted'],
                       str(item['PostSize']), item['Subject'].replace(',', ''), text])
    wrtr_file.close()
    # Fetch comments for every thread that has any.
    for item in self.response['Matches']['Match']:
        if item[u'CommentsInThread'] > 0:
            self.write_header_comment_file()
            Comment(key=self.key, outfile='comment_' + self.outfile,
                    filter_thread=item[u'ThreadId'])
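# save_response() pages through results by turning the Published timestamp of the
# last match into a Unix epoch for filter_inserted_to; get_response() is not shown
# here, so passing that value back to the API is an assumption. The conversion on
# its own, with an example timestamp:
import time

published = "2015-06-01 12:30:00"   # example value in the API's "%Y-%m-%d %H:%M:%S" format
filter_inserted_to = int(time.mktime(time.strptime(published, "%Y-%m-%d %H:%M:%S")))
print filter_inserted_to            # epoch seconds; mktime interprets the struct in local time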
def main():
    """Find webpage records whose addresses are missing from the local CSVs and write them out per state."""
    LOG.debug('Started')
    with open(HTML_DIR + CDC_FILENAME, 'rU') as csv_file:
        LOG.info("Reading URLs from %s", HTML_DIR + CDC_FILENAME)
        reader = UnicodeReader(csv_file)
        # skip the header
        reader.next()
        for rec in reader:
            state = rec[STATE_INDEX]
            filename = state.replace(' ', '_')
            url = rec[URL_INDEX]
            LOG.info("Processing %s", state)
            # retrieve the webpage data
            webpage_data = get_info_on_webpage(url)
            # get the addresses from the local csv files
            csv_addresses = get_addresses_from_csv(CSV_DIR + filename + ".csv")
            unique_in_webpage = [row for row in webpage_data
                                 if normalize_address(row[ADDRESS_INDEX]) not in csv_addresses]
            unique_in_webpage = sorted(unique_in_webpage, key=operator.itemgetter(CITY_INDEX))
            try:
                with open(OUTPUT_DIR + filename + ".csv", "wb") as csv_f:
                    writer = UnicodeWriter(csv_f)
                    writer.writerows(unique_in_webpage)
            except UnicodeEncodeError as error:
                print "UnicodeEncodeError while writing %s: %s" % (filename, error)
    LOG.debug('Finished')
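# normalize_address() is defined elsewhere in this project; the sketch below only
# illustrates the kind of normalization the membership test in main() depends on.
# The exact rules (lowercasing, punctuation stripping, whitespace collapsing) are
# an assumption, not the project's implementation.
import re

def normalize_address(address):
    address = address.lower()
    address = re.sub(r'[^\w\s]', '', address, flags=re.UNICODE)   # drop punctuation
    return ' '.join(address.split())                              # collapse whitespace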
# --- tail of writerecord(); the loop over listing entries begins earlier in the function ---
        if phone_info is None:
            continue
        phone = phone_info.text
        address = info.find('address').find('span').text
        wrtr.writerow([city, subcategory, name, phone, address])
        #print name, phone, address, city, subcategory
    # Follow the "next" link in the pager, if present, by recursing with page + 1.
    if data.find('ul', 'pager'):
        if data.find('ul', 'pager').find('li', 'next'):
            page = page + 1
            writerecord(base_url, subcategory, city, page)


if __name__ == '__main__':
    base_url = 'http://yellowpages.sulekha.com/clothing-accessories_delhi_clistings'
    cities = ['chennai']
    outfile = open('chennai_sulekha1.csv', 'wb')
    wrtr = UnicodeWriter(outfile, delimiter=';')
    wrtr.writerow(['City', 'Category', 'Name', 'Phone', 'Address'])
    for city in cities:
        url = base_url.replace('delhi', city)
        next_iter = True
        links = BeautifulSoup(urlopen(url), 'html.parser')
        for line in links.find('ol', 'business-clisting').findAll('div', 'blockTitle'):
            new_url = line.find('a')['href']
            # Skip categories until the tie-manufacturers entry is reached, then scrape the rest.
            if new_url == 'http://yellowpages.sulekha.com/tie-manufacturers_chennai':
                next_iter = False
            if next_iter:
                continue
            incity = '_' + city
            subcategory = new_url.split('/')[-1].replace('-', ' ').replace(incity, '')
            writerecord(new_url, subcategory, city)
    outfile.close()
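# The pager check at the end of writerecord() relies on Sulekha rendering a "next"
# item inside <ul class="pager">. A self-contained sketch of that check on canned
# markup (the HTML below is illustrative, not Sulekha's actual page):
from bs4 import BeautifulSoup

SAMPLE = '<ul class="pager"><li class="next"><a href="?page=2">Next</a></li></ul>'
data = BeautifulSoup(SAMPLE, 'html.parser')
pager = data.find('ul', 'pager')          # a string second argument filters on CSS class
if pager and pager.find('li', 'next'):
    print 'next page available'          # in the scraper this triggers writerecord(..., page + 1)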
def writerecord(url, wrtr, category):   # signature inferred from the call in __main__ below
    data = BeautifulSoup(urlopen(url), "html.parser")
    #print url
    for item in data.findAll('div', 'card'):
        if item.find('div', 'name') is None:
            break
        name = item.find('div', 'name').text.strip()
        place = item.find('div', 'place').text.strip()
        phone = item.find('a', 'mob-link').text.strip()
        if len(phone) == 0:
            continue
        wrtr.writerow([category, name, place, phone])


def locations(city_file):
    in_file = open(city_file)
    reader = csv.reader(in_file)
    for line in reader:
        yield (line[0], line[1])


if __name__ == '__main__':
    base_url = 'https://www.askme.com/search?q='
    outfile = open('askme_mumbai.csv', 'w')
    categories = map(lambda x: x.strip(), open('categories').readlines())
    wrtr = UnicodeWriter(outfile)
    wrtr.writerow(['category', 'name', 'address', 'phone'])
    for locality, city in locations('mumbai_localities.csv'):
        loc = locality.replace('.', '').replace('-', '').replace(' ', '+')
        for category in categories:
            cat = category.strip().replace(' ', '+') + '+'
            url = base_url + cat + 'in+' + loc + '&type=outlets&city=' + city
            writerecord(url, wrtr, category)
    outfile.close()
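# How the AskMe query URL is assembled for one locality/category pair; the locality
# and category values here are made up for illustration.
base_url = 'https://www.askme.com/search?q='
locality, city = 'Andheri East', 'mumbai'
loc = locality.replace('.', '').replace('-', '').replace(' ', '+')
cat = 'beauty parlours'.replace(' ', '+') + '+'
url = base_url + cat + 'in+' + loc + '&type=outlets&city=' + city
print url   # https://www.askme.com/search?q=beauty+parlours+in+Andheri+East&type=outlets&city=mumbai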
def file_Write(self, outputList):
    with open('translated.csv', 'wb') as files:
        writer = UnicodeWriter(files, quoting=csv.QUOTE_ALL)
        writer.writerows(outputList)
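# file_Write() passes quoting=csv.QUOTE_ALL through UnicodeWriter, so every field is
# wrapped in quotes. A self-contained demonstration of that quoting mode with the
# stdlib writer (the row contents are illustrative):
import csv
from StringIO import StringIO

buf = StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_ALL)
writer.writerows([['hello', 'bonjour'], ['world', 'monde']])
print buf.getvalue()
# "hello","bonjour"
# "world","monde"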