Example #1
def write_header_comment_file(self):
    # Write the CSV header row to the comment file exactly once per run.
    if self.header_written: return
    comment_file = 'comment_' + self.outfile
    wrtr_file = open(comment_file, 'ab')
    wrtr = UnicodeWriter(wrtr_file)
    wrtr.writerow([u'Language', u'Url', u'Published', u'Country', u'ThreadId', u'Inserted',
                   u'PostSize', u'Subject', u'Text'])
    wrtr_file.close()
    self.header_written = True
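
Every snippet in this collection writes through a UnicodeWriter helper (and Example #4 reads through a matching UnicodeReader) that is never shown in these excerpts. A minimal sketch of what it presumably is, assuming the classic Python 2 csv-module recipe that encodes each cell to UTF-8 before handing the row to csv.writer; the real helper used by these projects may differ:

import csv, codecs, cStringIO

class UnicodeWriter:
    """Sketch of the standard Python 2 csv-docs recipe for writing unicode rows as UTF-8."""
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Buffer each row in memory, then re-encode it onto the target stream.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        # Encode every cell to UTF-8 so csv.writer can handle it.
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # Re-encode into the target encoding and write to the real file.
        data = self.encoder.encode(data)
        self.stream.write(data)
        # Empty the queue for the next row.
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

UnicodeReader, used in Example #4, is presumably the mirror-image recipe that decodes UTF-8 bytes back into unicode strings on read.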
Example #2
def __init__(self, key, outfile, filter_thread):
    self.url = 'http://api.boardreader.com/v1/Blogs/Thread'
    self.key = key
    self.outfile = outfile
    self.filter_thread = filter_thread
    # Fetch every post in the thread and append it to the comment CSV as a side effect.
    self.response = requests.get(self.url, {'key': self.key, 'rt': 'json',
                                            'filter_thread': self.filter_thread})
    wrtr_file = open(self.outfile, 'ab')
    wrtr = UnicodeWriter(wrtr_file)
    for item in self.response.json()['response']['Matches']['Match']:
        # Strip commas, newlines, tabs, backspaces and hyphens so the text cannot break the CSV row.
        text = re.sub(r'[,\n\t\b-]', '', item['Text'])
        wrtr.writerow([item[u'Language'], item[u'Url'], item[u'Published'], item['Country'],
                       item[u'ThreadId'], item[u'Inserted'], str(item['PostSize']),
                       item['Subject'].replace(',', ''), text])
    wrtr_file.close()
Example #3
def save_response(self):
    # Page through the API until the query budget is exhausted
    # (get_response() is assumed to decrement self.query_limit).
    while self.query_limit > 0:
        self.filter_inserted_to = int(time.mktime(time.strptime(
            self.response['Matches']['Match'][-1]['Published'], "%Y-%m-%d %H:%M:%S")))
        self.response['Matches']['Match'].extend(self.get_response()['Matches']['Match'])
    wrtr_file = open(self.outfile, 'wb')
    wrtr = UnicodeWriter(wrtr_file)
    wrtr.writerow([u'Language', u'Url', u'Published', u'Country', u'ThreadId', u'Inserted',
                   u'PostSize', u'Subject', u'Text'])
    for item in self.response['Matches']['Match']:
        # Strip commas, newlines, tabs, backspaces and hyphens so the text cannot break the CSV row.
        text = re.sub(r'[,\n\t\b-]', '', item['Text'])
        wrtr.writerow([item[u'Language'], item[u'Url'], item[u'Published'], item['Country'],
                       item[u'ThreadId'], item[u'Inserted'], str(item['PostSize']),
                       item['Subject'].replace(',', ''), text])
    wrtr_file.close()
    # Threads with comments get their own CSV; Comment writes its rows in __init__ (see Example #2).
    for item in self.response['Matches']['Match']:
        if item[u'CommentsInThread'] > 0:
            self.write_header_comment_file()
            Comment(key=self.key, outfile='comment_' + self.outfile,
                    filter_thread=item[u'ThreadId'])
Example #4
def main():
    """docstring"""
    LOG.debug('Started')

    with open(HTML_DIR + CDC_FILENAME, 'rU') as csv_file:
        LOG.info("Reading URLs from %s", HTML_DIR+CDC_FILENAME)
        reader = UnicodeReader(csv_file)

        # skip the header
        reader.next()
        for rec in reader:
            state = rec[STATE_INDEX]
            filename = state.replace(' ', '_')

            url = rec[URL_INDEX]
            LOG.info("Processing %s", state)

            # retrieve the webpage data
            webpage_data = get_info_on_webpage(url)

            # get the addresses from the local csv files
            csv_addresses = get_addresses_from_csv(CSV_DIR + filename + ".csv")

            # Keep only webpage rows whose normalized address is absent from the local CSV.
            unique_in_webpage = [row for row in webpage_data
                                 if normalize_address(row[ADDRESS_INDEX])
                                 not in csv_addresses]

            unique_in_webpage = sorted(unique_in_webpage,
                                       key=operator.itemgetter(CITY_INDEX))

            try:
                with open(OUTPUT_DIR + filename + ".csv", "wb") as csv_f:
                    writer = UnicodeWriter(csv_f)
                    writer.writerows(unique_in_webpage)
            except UnicodeEncodeError as error:
                print "UnicodeEncodeError while writing %s: %s" % (filename, error)
    LOG.debug('Finished')
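
The helpers get_info_on_webpage, get_addresses_from_csv and normalize_address are not part of this excerpt. For the filtering above to work, get_addresses_from_csv presumably returns a set of already-normalized addresses. A purely hypothetical sketch of the latter two, assuming the local CSV shares the webpage column layout; the real implementations may differ:

def normalize_address(address):
    # Hypothetical: lower-case and collapse punctuation/whitespace so minor
    # formatting differences do not defeat the membership test.
    return ' '.join(address.lower().replace(',', ' ').replace('.', ' ').split())


def get_addresses_from_csv(path):
    # Hypothetical: collect the normalized addresses from a local CSV into a set
    # so the membership test in main() is cheap.
    addresses = set()
    with open(path, 'rU') as csv_f:
        reader = UnicodeReader(csv_f)
        reader.next()  # assumed header row
        for row in reader:
            addresses.add(normalize_address(row[ADDRESS_INDEX]))
    return addresses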
Example #5
        if phone_info is None: continue
        phone = phone_info.text
        address = info.find('address').find('span').text
        wrtr.writerow([city,subcategory,name,phone,address])
        #print name, phone, address, city, subcategory
    # Recurse onto the next results page while the pager still shows one.
    if data.find('ul', 'pager'):
        if data.find('ul', 'pager').find('li', 'next'):
            page = page + 1
            writerecord(base_url, subcategory, city, page)


if __name__ == '__main__':
    base_url = 'http://yellowpages.sulekha.com/clothing-accessories_delhi_clistings'
    cities = ['chennai']
    outfile = open('chennai_sulekha1.csv','wb')
    wrtr = UnicodeWriter(outfile,delimiter=';')
    wrtr.writerow(['City','Category','Name','Phone','Address'])
    for city in cities:
        url = base_url.replace('delhi',city)
        next_iter = True
        links = BeautifulSoup(urlopen(url),'html.parser')
        for line in links.find('ol','business-clisting').findAll('div','blockTitle'):
            new_url = line.find('a')['href']
            # Skip listings until the tie-manufacturers URL is reached, then scrape from there onward.
            if new_url == 'http://yellowpages.sulekha.com/tie-manufacturers_chennai':
                next_iter = False
            if next_iter: continue
            incity = '_' + city
            subcategory = new_url.split('/')[-1].replace('-',' ').replace(incity,'')
            writerecord(new_url,subcategory,city)
    outfile.close()
Example #6
    data = BeautifulSoup(urlopen(url),"html.parser")
    #print url
    for item in data.findAll('div','card'):
        if item.find('div', 'name') is None: break
        name = item.find('div', 'name').text.strip()
        place = item.find('div', 'place').text.strip()
        phone = item.find('a', 'mob-link').text.strip()
        # Skip listings that have no phone number.
        if len(phone) == 0: continue
        wrtr.writerow([category, name, place, phone])

def locations(city_file):
    # Yield (locality, city) pairs from a two-column CSV.
    with open(city_file) as in_file:
        reader = csv.reader(in_file)
        for line in reader:
            yield (line[0], line[1])


if __name__ == '__main__':
    base_url = 'https://www.askme.com/search?q='
    outfile = open('askme_mumbai.csv','w')
    categories = [line.strip() for line in open('categories')]
    wrtr = UnicodeWriter(outfile)
    wrtr.writerow(['category','name','address','phone'])
    for locality,city in locations('mumbai_localities.csv'):
        loc = locality.replace('.','').replace('-','').replace(' ','+')
        for category in categories:
            cat = category.strip().replace(' ','+') + '+'
            url = base_url+cat+'in+'+loc+'&type=outlets&city='+city
            writerecord(url,wrtr,category)
    outfile.close()
Example #7
def file_Write(self, outputList):
    # Write every row of outputList to translated.csv, quoting all fields.
    with open('translated.csv', 'wb') as out_file:
        writer = UnicodeWriter(out_file, quoting=csv.QUOTE_ALL)
        writer.writerows(outputList)