def row_to_content_obj(key_row):
    '''Returns ``FeatureCollection`` given an HBase artifact row.

    Note that the FC returned has a Unicode feature ``artifact_id``
    set to the row's key.

    :param key_row: a ``(key, row)`` pair.  ``key`` is a byte string
      (decoded as UTF-8) or a Unicode string; ``row`` is a dict-like
      object from which the optional ``response`` (itself holding an
      optional ``body``), ``indices``, ``url`` and ``timestamp``
      entries are read.
    :return: ``(content_id, fc)``.  ``fc`` is ``None`` when the
      feature collection could not be built; the traceback is then
      written to stderr.
    '''
    key, row = key_row

    # Normalise the key to Unicode once, up front, and derive the
    # content id from that.  Calling ``.encode`` directly on a byte
    # string makes Python 2 decode it as ASCII first, which fails for
    # any non-ASCII key before the byte-string case is ever handled.
    artifact_id = key
    if isinstance(artifact_id, bytes):
        artifact_id = artifact_id.decode('utf-8')
    cid = mk_content_id(artifact_id.encode('utf-8'))

    response = row.get('response', {})

    # Each (attribute, value) index pair bumps that value's count in
    # the counter kept for the attribute.
    other_bows = defaultdict(StringCounter)
    for attr, val in row.get('indices', []):
        other_bows[attr][val] += 1

    try:
        fc = html_to_fc(
            response.get('body', ''),
            url=row.get('url'),
            timestamp=row.get('timestamp'),
            other_features=dict(other_bows, **{'artifact_id': artifact_id}))
    except Exception:
        # Best effort: report the failure and hand back ``None`` so the
        # caller can carry on with other rows.  ``Exception`` rather
        # than a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
def row_to_content_obj(key_row):
    '''Returns ``FeatureCollection`` given an HBase artifact row.

    Note that the FC returned has a Unicode feature ``artifact_id``
    set to the row's key.

    :param key_row: a ``(key, row)`` pair.  ``key`` is a byte string
      (decoded as UTF-8) or a Unicode string; ``row`` is a dict-like
      object from which the optional ``response`` (itself holding an
      optional ``body``), ``indices``, ``url`` and ``timestamp``
      entries are read.
    :return: ``(content_id, fc)``.  ``fc`` is ``None`` when the
      feature collection could not be built; the traceback is then
      written to stderr.
    '''
    key, row = key_row

    # Normalise the key to Unicode once, up front, and derive the
    # content id from that.  Calling ``.encode`` directly on a byte
    # string makes Python 2 decode it as ASCII first, which fails for
    # any non-ASCII key before the byte-string case is ever handled.
    artifact_id = key
    if isinstance(artifact_id, bytes):
        artifact_id = artifact_id.decode('utf-8')
    cid = mk_content_id(artifact_id.encode('utf-8'))

    response = row.get('response', {})

    # Each (attribute, value) index pair bumps that value's count in
    # the counter kept for the attribute.
    other_bows = defaultdict(StringCounter)
    for attr, val in row.get('indices', []):
        other_bows[attr][val] += 1

    try:
        fc = html_to_fc(
            response.get('body', ''),
            url=row.get('url'),
            timestamp=row.get('timestamp'),
            other_features=dict(other_bows, **{'artifact_id': artifact_id}))
    except Exception:
        # Best effort: report the failure and hand back ``None`` so the
        # caller can carry on with other rows.  ``Exception`` rather
        # than a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
def feature_pipeline(chunk_in, FC_chunk_out):
    '''Run a basic pipeline to generate feature collections from
    streamitems.

    If ``FC_chunk_out`` already exists it is simply loaded.  Otherwise
    every ``*.sc.xz`` file directly under ``chunk_in`` is read, each
    item's body is converted with ``html_to_fc``, and the resulting
    FCs are added to a new chunk at ``FC_chunk_out``.

    `chunk_in` path to the directory holding the ``*.sc.xz`` SC chunk
    files

    `FC_chunk_out` path where the FC chunk file will be written

    Returns a list of either the generated FCs or the FCs in the
    existing file.
    '''
    # Local import: only needed for the partial-line progress message.
    import sys

    if isfile(FC_chunk_out):
        # No newline here on purpose: 'loaded.' completes this line
        # once the chunk has been read.
        sys.stdout.write('%s already exists... ' % FC_chunk_out)
        fcs = [fc for fc in FC_Chunk(FC_chunk_out, mode='rb')]
        print('loaded.')
        return fcs

    chunk_out = FC_Chunk(FC_chunk_out, mode='wb')
    fcs = []
    for cfile in glob.glob(join(chunk_in, '*.sc.xz')):
        print('processing %s' % cfile)
        for i, si in enumerate(SC_Chunk(cfile)):
            # Progress report on every tenth item.
            if i % 10 == 0:
                print('%d fc processed' % i)
            fc = html_to_fc(
                html=si.body.raw,
                encoding=si.body.encoding,
                url=si.abs_url)
            chunk_out.add(fc)
            fcs.append(fc)

    # Close the writer explicitly so everything added above is on disk
    # before the file is reported as created.
    # NOTE(review): assumes FC_Chunk exposes close() the way
    # streamcorpus chunk writers do -- confirm against its definition.
    chunk_out.close()
    print('done creating %s' % FC_chunk_out)
    return fcs
def from_forum_post(row):
    '''Build a feature collection from a forum post row.

    The post's ``content`` entry (stripped of surrounding whitespace)
    is passed to ``html_to_fc`` together with its ``thread_link`` as
    the URL, the timestamp from ``forum_post_timestamp(row)`` and the
    extra features from ``forum_post_features(row)``.

    :param row: mapping with at least ``content`` and ``thread_link``
      entries; it is also handed to the ``forum_post_*`` helpers.
    :return: ``(content_id, fc)``.  ``fc`` is ``None`` when the
      feature collection could not be built; the traceback is then
      written to stderr.
    '''
    cid = forum_post_id(row)
    try:
        fc = html_to_fc(
            row['content'].strip(),
            url=row['thread_link'],
            timestamp=forum_post_timestamp(row),
            other_features=forum_post_features(row))
    except Exception:
        # Best effort: report the failure and return ``None`` for the
        # FC.  ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        # Keep the traceback on the same stream as its header line.
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc