def scanner(self, numRows=100, startRow=None, stopRow=None):
    """Scan the table and return every row in the range as a list of dicts.

    Rows are pulled ``numRows`` at a time until the scanner returns an empty
    batch. Each result dict holds the row key under ``'row'`` plus one entry
    per column qualifier, mapping column family to value::

        {'row': <key>, <qualifier>: {<family>: <value>, ...}, ...}

    Values of families registered as ``str`` in ``self.columnFamiliesType``
    are stored as received; values of families registered as ``int`` are
    passed through ``decode``. For any other registered type the qualifier
    entry is created but the value is not stored.

    :param numRows: batch size for each ``scannerGetList`` call.
    :param startRow: first positional argument given to ``Hbase.TScan``.
    :param stopRow: second positional argument given to ``Hbase.TScan``.
    :return: list of row dicts, in the order the scanner returned them.
    :raises ValueError: if a column name has no ``:`` separator or its
        family is not listed in ``self.columnFamilies``.
    """
    scan = Hbase.TScan(startRow, stopRow)
    scannerId = self.client.scannerOpenWithScan(self.table, scan, {})
    ret = []
    try:
        rowList = self.client.scannerGetList(scannerId, numRows)
        while rowList:
            for r in rowList:
                rd = {'row': r.row}
                # items() instead of iteritems() so this runs on Python 2 and 3.
                for k, v in r.columns.items():
                    # Split on the first ':' only: the family never contains
                    # one, but the qualifier may.
                    cf, qualifier = k.split(':', 1)
                    cell = rd.setdefault(qualifier, {})
                    idx = self.columnFamilies.index(cf)
                    if self.columnFamiliesType[idx] == str:
                        cell[cf] = v.value
                    elif self.columnFamiliesType[idx] == int:
                        cell[cf] = decode(v.value)
                ret.append(rd)
            rowList = self.client.scannerGetList(scannerId, numRows)
    finally:
        # Release the scanner even when fetching or decoding fails.
        self.client.scannerClose(scannerId)
    return ret
def get_statuses(self, uid):
    """Yield each status stored for ``uid`` as a plain dict.

    Scans ``self.cfg['table_status']`` between ``pack_mid(uid, 0)`` and
    ``pack_mid(uid, 0x7fffffffffffffff)``. Every row's columns are handed to
    ``load_status``, which returns a ``(status, repost)`` pair; rows whose
    status is ``None`` are skipped. The yielded dict is the status object's
    ``__dict__`` without its ``'batches'`` entry, plus a
    ``'retweeted_status'`` key holding the repost's ``__dict__`` (also
    without ``'batches'``) when a repost is present.

    The scanner is closed when the generator finishes, is closed early by
    the consumer, or raises.

    :param uid: user id, forwarded to ``pack_mid`` to build the row range.
    :return: generator of status dicts.
    :raises KeyError: if a status or repost object has no ``batches``
        attribute.
    """
    key_beg = pack_mid(uid, 0)
    key_end = pack_mid(uid, 0x7fffffffffffffff)
    scan = Hbase.TScan(startRow=key_beg, stopRow=key_end)
    client = self._get_client()
    scanner = client.scannerOpenWithScan(self.cfg['table_status'], scan, None)
    try:
        i = 0
        while True:
            # The requested batch size grows by one per round trip
            # (1, 2, 3, ...); kept exactly as the original behaved.
            i += 1
            row_list = client.scannerGetList(scanner, i)
            if not row_list:
                break
            for row in row_list:
                status, repost = load_status(row.columns)
                if status is None:
                    continue
                # pop() mutates the loaded object in place.
                status.__dict__.pop('batches')
                ret = {}
                ret.update(status.__dict__)
                if repost is not None:
                    repost.__dict__.pop('batches')
                    ret['retweeted_status'] = repost.__dict__
                yield ret
    finally:
        # Runs on normal exhaustion, on generator.close() and on errors, so
        # the server-side scanner is never left open.
        client.scannerClose(scanner)
def scanWithKeyword(self, __filter):
    """Return up to 100 rows whose ``content:0`` value contains a keyword.

    Builds a scan restricted to the ``content:0`` column with the filter
    string ``ValueFilter(=,'substring:<keyword>')``, fetches a single batch
    of at most 100 rows and closes the scanner before returning.

    :param __filter: keyword interpolated into the filter string.
    :return: the list returned by ``scannerGetList`` for that one batch.
    """
    scan = Hbase.TScan()
    scan.columns = ['content:0']
    # NOTE(review): the keyword is interpolated verbatim; a value containing
    # a single quote will likely produce a malformed filter - confirm whether
    # callers sanitize it.
    scan.filterString = "ValueFilter(=,'substring:%s')" % (__filter)
    scannerId = self.client.scannerOpenWithScan(self.table, scan, {})
    try:
        return self.client.scannerGetList(scannerId, 100)
    finally:
        # The original never closed the scanner, leaking it on every call.
        self.client.scannerClose(scannerId)
def from_crawler(cls, crawler):
    """Configure ``cls`` from the crawler settings and return an instance.

    Reads ``HBASE_HOST``, ``HBASE_PORT``, ``PROXY_TABLE`` and
    ``PROXIES_TIMEOUT`` from ``crawler.settings`` and stores the resulting
    ``HbaseWrapper``, a lock, the timeout and a ``Hbase.TScan`` template as
    class attributes. ``cls._get_proxies()`` is then called, and finally a
    new instance is created whose ``spider_closed`` method is connected to
    the ``spider_closed`` signal.

    :param crawler: object exposing ``settings`` and ``signals``.
    :return: the newly created instance of ``cls``.
    """
    settings = crawler.settings
    hbase_host = settings.get('HBASE_HOST')
    hbase_port = settings.get('HBASE_PORT')
    proxy_table = settings.get('PROXY_TABLE')

    # Class-level state is populated before _get_proxies() is called.
    cls.hbase = HbaseWrapper(hbase_host, hbase_port, proxy_table)
    cls.mutex = thread.allocate_lock()
    cls.timeout = settings.get('PROXIES_TIMEOUT')
    cls.tscan = Hbase.TScan(columns=['cf:0'], caching=True, batchSize=20)
    cls._get_proxies()

    instance = cls()
    crawler.signals.connect(instance.spider_closed,
                            signal=signals.spider_closed)
    return instance
def scanner(self, numRows=100, startRow=None, stopRow=None):
    """Fetch one batch of rows and map each row key to its ``info:name``.

    Only a single ``scannerGetList`` call is made, so at most ``numRows``
    rows are returned. The scanner is closed before the rows are decoded.

    :param numRows: number of rows requested from the scanner.
    :param startRow: first positional argument given to ``Hbase.TScan``.
    :param stopRow: second positional argument given to ``Hbase.TScan``.
    :return: list of single-entry dicts ``{row_key: name}``, where both the
        row key and the ``info:name`` value are decoded as UTF-8.
    :raises KeyError: if a returned row has no ``b'info:name'`` column.
    """
    scan = Hbase.TScan(startRow, stopRow)
    scannerId = self.client.scannerOpenWithScan(self.table, scan, {})
    try:
        rowList = self.client.scannerGetList(scannerId, numRows)
    finally:
        # The original never closed the scanner, leaking it on every call.
        self.client.scannerClose(scannerId)
    return [
        {r.row.decode('utf-8'): r.columns[b'info:name'].value.decode('utf-8')}
        for r in rowList
    ]
mutationsbatch.append(Hbase.BatchMutation(row='9', mutations=mutations)) mutations = [ Hbase.Mutation(column='info:FULLNAME', value='Ronald Adina'), Hbase.Mutation(column='info:AGE', value='41'), Hbase.Mutation(column='contact:EMAILID', value='*****@*****.**'), Hbase.Mutation(column='contact:PHONE', value='453-555-0165'), Hbase.Mutation(column='others:MODIFIEDDATE', value='5/16/2005 4:33:33 PM') ] mutationsbatch.append( Hbase.BatchMutation(row='10', mutations=mutations)) client.mutateRows(tableName, mutationsbatch, None) scan = Hbase.TScan(startRow=None, stopRow=None) scannerId = client.scannerOpenWithScan(tableName, scan, None) scanValues = client.scannerGet(scannerId) if len(scanValues) == 1: while len(scanValues) == 1: for row in scanValues: print '\n' print '%s' % (row.row), column = row.columns for values in column: print '%s' % (row.columns.get(values).value), scanValues = client.scannerGet(scannerId) client.scannerClose(scannerId) transport.close() except Thrift.TException, tx:
rows = client.getRow(tablename, "shakespeare-comedies-000001") # Do a pull on a single row for row in rows: # Pull out values in cell message = row.columns.get(messagecolumncf).value username = row.columns.get(usernamecolumncf).value linenumber = decode(row.columns.get(linenumbercolumncf).value) rowKey = row.row print("Got row: " + rowKey + ":" + str(linenumber) + ":" + username + ":" + message) # Open a scan over all comedy rows in Shakespeare scan = Hbase.TScan(startRow="shakespeare-comedies-000001", stopRow="shakespeare-comedies-999999") scannerId = client.scannerOpenWithScan(tablename, scan) # Go through every row passed back by scanner row = client.scannerGet(scannerId) # Go through every row passed back by scanner rowList = client.scannerGetList(scannerId, numRows) while rowList: for row in rowList: # Pull out values in columns message = row.columns.get(messagecolumncf).value username = row.columns.get(usernamecolumncf).value linenumber = decode(row.columns.get(linenumbercolumncf).value)