def _iter_bson(path):
    """Yield decoded documents from a BSON dump at *path*.

    Bad unicode in the dump is ignored rather than raising, matching the
    original inline decode_file_iter calls.
    """
    with open(path, "rb") as f:
        opts = bson.CodecOptions(unicode_decode_error_handler="ignore")
        for doc in bson.decode_file_iter(f, opts):
            yield doc


def large_social_networks_twitter():
    """Build the Twitter follower/followee graph and write it to disk.

    Pipeline:
      1. First pass over the followees dump: add one node per ``user_name``.
      2. Second pass over followees and followers dumps: add an edge only
         when both endpoints are already known users.
      3. Save the unthresholded graph, prune nodes with degree below
         ``degree_thrd``, add self-loops via an identity diagonal, and write
         the final graph in gpickle/adjlist/edgelist forms.

    Side effects: writes twitter.nothred.gpickle, twitter.gpickle,
    twitter_adj and twitter_edgelist in the working directory.
    """
    degree_thrd = 3  # nodes with degree below this are dropped
    followees_path = '/mnt/wzhan139/cross media data/Twitter/twitter_followees.bson'
    followers_path = '/mnt/wzhan139/cross media data/Twitter/twitter_followers.bson'

    G = nx.Graph()
    for c, d in enumerate(_iter_bson(followees_path)):
        print("Reading node " + str(c))
        G.add_node(d['user_name'])

    # The followees and followers passes were duplicated code; drive both
    # from one loop. An edge is added only when the linked account is a
    # known node, so edges never introduce new users.
    for key, path in (('followees', followees_path),
                      ('followers', followers_path)):
        for c, d in enumerate(_iter_bson(path)):
            print("Constructing graph in node " + str(c))
            for linked in d[key]:
                if G.has_node(linked['screen_name']):
                    G.add_edge(d['user_name'], linked['screen_name'])

    G2 = nx.convert_node_labels_to_integers(G, label_attribute='old_label')
    num_node = nx.adjacency_matrix(G2).shape[0]
    sparsity = G2.number_of_edges() / num_node ** 2
    print("no thredshold graph sparsity is " + str(sparsity))
    print(nx.info(G2))
    nx.write_gpickle(G2, "twitter.nothred.gpickle")

    # Prune low-degree nodes (plain list is fine here; the original wrapped
    # it in np.asarray for no benefit), then compact the integer labels.
    remove_node = [n for n in G2.nodes() if G2.degree(n) < degree_thrd]
    G2.remove_nodes_from(remove_node)
    G2 = nx.convert_node_labels_to_integers(G2)
    num_node = nx.adjacency_matrix(G2).shape[0]

    # Identity diagonal as a graph of self-loops, composed onto G2.
    G3 = nx.from_scipy_sparse_matrix(
        sp.dia_matrix((np.ones(num_node), 0),
                      shape=nx.adjacency_matrix(G2).shape))
    G4 = nx.compose(G2, G3)
    nx.write_gpickle(G4, "twitter.gpickle")
    nx.write_adjlist(G4, "twitter_adj")
    nx.write_edgelist(G4, "twitter_edgelist")
    sparsity = G4.number_of_edges() / num_node ** 2
    print("sparsity is " + str(sparsity))
    print(nx.info(G4))
def get(self, mydb, mycol, who, what=None, sort=None, son=False, limit=None):
    """Run ``db.mycol.find(who, what)`` and return the matching documents.

    Args:
        mydb: database name handed to ``self.connect``.
        mycol: collection name.
        who: Mongo filter document.
        what: optional projection document.
        sort: optional iterable of ``(field, direction)`` pairs; direction
            ``1`` sorts ascending, anything else descending.
        son: when true, decode documents as ``bson.SON`` (key order kept).
        limit: optional maximum number of documents to return.

    Returns:
        A list of documents, or ``None`` if the query failed (the failure
        is logged via ``l.log_exception``).
    """
    try:
        self.connect(mydb)
        if son:
            opts = bson.CodecOptions(document_class=bson.SON)
            col = self.dbase[mycol].with_options(codec_options=opts)
        else:
            col = self.dbase[mycol]
        cursor = col.find(who, what) if what else col.find(who)
        if sort:
            srt = [(field,
                    pymongo.ASCENDING if direction == 1 else pymongo.DESCENDING)
                   for field, direction in sort]
            cursor = cursor.sort(srt)
        if limit:
            cursor = cursor.limit(limit)
        return list(cursor)
    except Exception:
        # Was a bare ``except:``, which also traps SystemExit and
        # KeyboardInterrupt; keep the best-effort contract but narrow it.
        l.log_exception('MyMongo.get')
        return None
def _codec_options(**options):
    """Build a :class:`bson.CodecOptions` from keyword arguments.

    Only the keywords listed in ``_CO_OPTIONS`` are forwarded to
    ``bson.CodecOptions``; anything else is silently dropped.

    :return: :class:`~bson.CodecOptions`
    """
    known = anyconfig.utils.filter_options(_CO_OPTIONS, options)
    return bson.CodecOptions(**known)
def iter_records(curs, *names):
    """Yield one tuple per document in *curs*, with fields ordered as *names*.

    Each name is a dotted field path; fields absent from a document come
    back as ``None``.
    """
    codec_opts = bson.CodecOptions(document_class=bson.raw_bson.RawBSONDocument)
    path_trie = PathTrie([name.split('.') for name in names])
    blank = {name: None for name in names}
    for doc in curs:
        record = dict(blank)
        for key_path, value in get_fields(doc.raw, codec_opts, path_trie):
            record['.'.join(key_path)] = value
        yield tuple(record[name] for name in names)
def bson_as_json(value, debugger, verbose=False, oneline=False, raw=False):
    """Render an lldb ``bson_t`` value as JSON text.

    Any failure is returned as a string rather than raised, since lldb
    invokes this as a type summary and cannot handle exceptions well.
    """
    try:
        target = debugger.GetSelectedTarget()
        inline_t = target.FindFirstType('bson_impl_inline_t')
        alloc_t = target.FindFirstType('bson_impl_alloc_t')
        # No display name means libbson's debug symbols are not loaded.
        if not inline_t.GetDisplayTypeName():
            return """error: libbson not compiled with debug symbols Download latest mongo-c-driver.tar.gz from mongoc.org and do: ./configure --enable-debug make sudo make install """
        if value.TypeIsPointerType():
            value = value.Dereference()
        length = value.GetChildMemberWithName('len').GetValueAsUnsigned()
        flags = value.GetChildMemberWithName('flags').GetValueAsUnsigned()
        # Reject unknown flag bits and lengths outside [5, 16 MiB].
        if flags & ~ALL_FLAGS or length < 5 or length > 16 * 1024 * 1024:
            return 'uninitialized'
        if flags & FLAGS['INLINE']:
            # Inline storage: data is embedded in the struct itself; lengths
            # over 120 bytes cannot be inline, so treat as uninitialized.
            if length > 120:
                return 'uninitialized'
            inline = value.Cast(inline_t)
            data = inline.GetChildMemberWithName('data')
            raw_bson = get_inline_bytes(data)
        else:
            # Heap-allocated storage: document bytes live at buf + offset.
            alloc = value.Cast(alloc_t)
            offset = alloc.GetChildMemberWithName(
                'offset').GetValueAsUnsigned()
            buf = alloc.GetChildMemberWithName('buf').Dereference()
            raw_bson = get_allocated_bytes(buf, offset, debugger)
        if raw:
            # Caller asked for the raw bytes, not a JSON rendering.
            return repr(raw_bson)
        ret = ''
        if verbose:
            ret += 'len=%s\n' % length
            ret += flags_str(flags) + '\n'
        if oneline:
            indent = None
        else:
            indent = 2
        # NOTE(review): DuplicateKeyDict presumably preserves duplicate keys
        # during decoding — confirm against its definition elsewhere.
        codec_options = bson.CodecOptions(document_class=DuplicateKeyDict)
        ret += json_util.dumps(bson.BSON(raw_bson).decode(codec_options),
                               indent=indent)
        return ret
    except Exception as exc:
        return str(exc)
def __init__(self, db):
    """Bind the backing store: a plain dict (in-memory) or a Mongo database."""
    self.db = db
    if isinstance(db, dict):
        # In-memory mode: no real collections, just recursive dicts.
        self.materials = RecursiveDict()
        self.compositions = RecursiveDict()
        return
    # Mongo mode: decode documents as ordered bson.SON dicts.
    son_opts = bson.CodecOptions(document_class=bson.SON)
    self.contributions = db.contributions.with_options(codec_options=son_opts)
    self.materials = db.materials.with_options(codec_options=son_opts)
    self.compositions = db.compositions.with_options(codec_options=son_opts)
def bson_dumps(raw_bson, oneline):
    """Decode raw BSON bytes and render them as JSON text.

    *oneline* true gives compact output; false pretty-prints with indent 2.
    """
    if not bson:
        # ``bson`` is falsy when the optional pymongo import failed.
        return "No PyMongo, do `python -m pip install pymongo`"
    opts = bson.CodecOptions(document_class=DuplicateKeyDict)
    decoded = bson.BSON(raw_bson).decode(opts)
    return json_util.dumps(decoded, indent=None if oneline else 2)
def __init__(self, db):
    """Bind the backing store and, in Mongo mode, configure plotting."""
    self.db = db
    if isinstance(db, dict):
        # In-memory mode: recursive dicts stand in for the collections.
        self.materials = RecursiveDict()
        self.compositions = RecursiveDict()
        return
    # Mongo mode: plotly/cufflinks are imported lazily so in-memory use
    # never needs them; the import itself configures plotly's machinery.
    import plotly.plotly as py
    import cufflinks
    cufflinks.set_config_file(world_readable=True, theme='pearl')
    son_opts = bson.CodecOptions(document_class=bson.SON)
    self.contributions = db.contributions.with_options(codec_options=son_opts)
    self.materials = db.materials.with_options(codec_options=son_opts)
    self.compositions = db.compositions.with_options(codec_options=son_opts)
def __init__(self, db=None):
    """Bind an optional Mongo database and an optional Faker instance.

    Args:
        db: optional Mongo database handle. When given, the contributions,
            materials and compositions collections are wrapped to decode
            documents as ordered ``bson.SON`` dicts.
    """
    self.db = db
    try:
        # Faker is an optional dependency; fall back to None without it.
        from faker import Faker
        self.fake = Faker()
    except Exception:
        # Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the optional-import behavior
        # without hiding interpreter-exit signals.
        self.fake = None
    if self.db is not None:
        opts = bson.CodecOptions(document_class=bson.SON)
        self.contributions = self.db.contributions.with_options(
            codec_options=opts)
        self.materials = self.db.materials.with_options(codec_options=opts)
        self.compositions = self.db.compositions.with_options(
            codec_options=opts)
def __init__(self, db):
    """Bind the backing store and prepare a notebook execution preprocessor."""
    self.db = db
    self.nbdir = os.path.dirname(os.path.abspath(__file__))
    self.ep = ExecutePreprocessor(
        timeout=600, kernel_name='python2', allow_errors=False)
    if isinstance(db, dict):
        # In-memory mode: recursive dicts stand in for the collections.
        self.materials = RecursiveDict()
        self.compositions = RecursiveDict()
        return
    # Mongo mode: decode documents as ordered bson.SON dicts.
    son_opts = bson.CodecOptions(document_class=bson.SON)
    self.contributions = db.contributions.with_options(codec_options=son_opts)
    self.materials = db.materials.with_options(codec_options=son_opts)
    self.compositions = db.compositions.with_options(codec_options=son_opts)
def _check_colls_eq(primary_db, secondary_db, coll_name, sb):
    """
    Appends information to 'sb' about the differences between the
    'coll_name' collection on the primary and the 'coll_name' collection
    on the secondary, if any.
    """
    # Decode documents as ordered bson.SON dicts on both sides.
    son_opts = bson.CodecOptions(document_class=bson.SON)
    extracted = []
    for db in (primary_db, secondary_db):
        coll = db.get_collection(coll_name, codec_options=son_opts)
        extracted.append(CheckReplDBHash._extract_documents(coll))
    CheckReplDBHash._get_collection_diff(extracted[0], extracted[1], sb)
def bson_iter_type_summary(value, internal_dict):
    """Format a bson_iter_t as a chunk of JSON and a ^ marking the position."""
    try:
        if value.TypeIsPointerType():
            value = value.Dereference()
        length = value.GetChildMemberWithName('len').GetValueAsUnsigned()
        data = value.GetChildMemberWithName('raw')
        raw_bson = get_allocated_bytes(data, 0, lldb.debugger)
        key_offset = value.GetChildMemberWithName('key').unsigned

        # JSONify the BSON document.
        codec_options = bson.CodecOptions(document_class=DuplicateKeyDict)
        as_dict = bson.BSON(raw_bson).decode(codec_options)
        obj = json_util._json_convert(as_dict, json_util.DEFAULT_JSON_OPTIONS)
        as_json = json_util.dumps(as_dict)
        if key_offset:
            # Find the iter's position.
            # HACK, fails if there are dupe keys, or a value same as this key.
            key = '"%s"' % get_cstring(data, key_offset, length, lldb.debugger)
            pos = 0
            for chunk in json.JSONEncoder().iterencode(obj):
                if chunk == key:
                    break
                pos += len(chunk)
        else:
            pos = 0
        width = lldb.debugger.GetTerminalWidth() - 1

        # Where in the JSON string to start printing. FIX: this used
        # ``width / 2``, which is a float in Python 3 and made the string
        # slices below raise TypeError (silently caught by the except).
        start = max(0, pos - width // 2)
        end = min(len(as_json), pos + width // 2)
        ret = '\n'
        ret += as_json[start:end]
        ret += '\n'
        ret += ' ' * (pos - start) + '^'
        ret += '\n'
        return ret
    except Exception as exc:
        return str(exc)
def getOne(self, mydb, mycol, who, what=None, sort=None, son=False):
    """Run ``db.mycol.find_one(who, what)`` and return the document.

    Args:
        mydb: database name handed to ``self.connect``.
        mycol: collection name.
        who: Mongo filter document.
        what: optional projection document.
        sort: optional iterable of ``(field, direction)`` pairs; direction
            ``1`` sorts ascending, anything else descending.
        son: when true, decode the document as ``bson.SON`` (key order kept).

    Returns:
        The first matching document (or ``None`` when nothing matches), or
        ``None`` if the query failed (logged via ``l.log_exception``).
    """
    try:
        self.connect(mydb)
        if son:
            opts = bson.CodecOptions(document_class=bson.SON)
            col = self.dbase[mycol].with_options(codec_options=opts)
        else:
            col = self.dbase[mycol]
        srt = []
        if sort:
            srt = [(field,
                    pymongo.ASCENDING if direction == 1 else pymongo.DESCENDING)
                   for field, direction in sort]
        if what:
            data = col.find_one(who, what, sort=srt)
        else:
            data = col.find_one(who, sort=srt)
        return data
    except Exception:
        # Was a bare ``except:``, which also traps SystemExit and
        # KeyboardInterrupt; keep the best-effort contract but narrow it.
        l.log_exception('MyMongo.getOne')
        return None
def convert(source_path, target_path):
    """Convert a BSON dump into a gzip-compressed Parquet file.

    Each document is flattened row-wise: dict values collapse to their
    ``"id"`` entry (or are dropped), and strings longer than
    ``limit_length`` are dropped. Rows are buffered into DataFrames every
    10000 documents to bound memory, concatenated, cast to str, and
    exported.

    Args:
        source_path: path of the BSON dump to read.
        target_path: path of the Parquet file to write.
    """
    from pm4pymdl.util.parquet_exporter import exporter as parquet_exporter
    limit_length = 35  # strings longer than this are dropped from the row
    batch_size = 10000
    codec_options = bson.CodecOptions(unicode_decode_error_handler='ignore')
    df_list = []
    json_list = []
    # FIX: the original handed a bare open() to decode_file_iter and never
    # closed it; the context manager guarantees the file is released.
    with open(source_path, 'rb') as source_file:
        gen = bson.decode_file_iter(source_file, codec_options=codec_options)
        for i, row in enumerate(gen):
            for col in list(row.keys()):
                value = row[col]
                if isinstance(value, dict):
                    # Keep only the nested id; drop dicts without one.
                    if "id" in value:
                        row[col] = value["id"]
                    else:
                        del row[col]
                elif isinstance(value, str) and len(value) > limit_length:
                    del row[col]
            json_list.append(row)
            if i > 0 and i % batch_size == 0:
                print(i)
                df_list.append(pd.DataFrame(json_list))
                json_list = []
    if json_list:
        df_list.append(pd.DataFrame(json_list))
    if df_list:
        overall_df = pd.concat(df_list)
        # Parquet export expects uniform string columns here.
        for col in overall_df.columns:
            overall_df[col] = overall_df[col].astype(str)
        parquet_exporter.apply(overall_df, target_path,
                               parameters={"compression": "gzip"})
def get_metrics(urls, req_method="GET", timeout=45, req_type='html',
                req_data=None):
    """
    Sends http requests based on the provided urls and collects the
    corresponding metrics messages from a local UDP socket.

    Keyword arguments:
    urls -- The URLS to call.
    req_method -- The Http method for the call
    timeout -- timeout used for waiting for metrics response, should be set
    slightly greater than the metrics reporting interval

    Returns a list of decoded metrics messages, or None if making the
    requests failed.
    """
    print('Collecting metrics from udp socket {0}:{1}'.format(
        LISTEN_HOST, LISTEN_PORT))
    udpsock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        udpsock.settimeout(1)
        udpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        udpsock.bind((
            LISTEN_HOST,
            LISTEN_PORT,
        ))
        start_time = time.time()
        try:
            for url in urls:
                _make_call(url, req_method, req_data, req_type)
        except Exception:
            print('error on {0}'.format(urls))
            # FIX: the original returned here without closing the socket,
            # leaking the bound port; the enclosing finally now closes it.
            return None
        metrics = []
        keep_receiving = True
        # NOTE(review): first_ping is never set True anywhere in this
        # function, so the first_ping branches below look dead — confirm
        # against callers/history before removing.
        first_ping = False
        timer_5s = 0
        while keep_receiving:
            if (time.time() - start_time > timeout):
                print('Failed to collect metrics after waiting for {0}'.format(
                    timeout))
                keep_receiving = False
            else:
                # Once a 5-second grace timer has been started, stop when
                # it expires.
                if timer_5s > 0 and (time.time() - timer_5s > 5):
                    keep_receiving = False
                try:
                    ret = udpsock.recv(BUFFER_SIZE)
                    if ret:
                        bson_data = bson.BSON(ret)
                        try:
                            # to_dict() only exists on very old pymongo;
                            # newer versions use decode().
                            data = bson_data.to_dict(mdict)
                        except AttributeError:
                            data = bson_data.decode(
                                codec_options=bson.CodecOptions(
                                    document_class=mdict))
                        if "measurements" in data:
                            # then it's a metrics message
                            for m in data["measurements"]:
                                if "tags" in m or first_ping or "IsCustom" in m:
                                    metrics.append(data)
                            data = None
                            if first_ping:
                                keep_receiving = False
                            elif timer_5s == 0:
                                timer_5s = time.time()
                                # NOTE(review): this break exits the receive
                                # loop right after starting the 5s timer,
                                # which makes the timer check above dead —
                                # preserved as-is, verify intent upstream.
                                break
                    else:
                        print(
                            'Failed to collect metrics from udp socket {0}:{1} gave empty response'
                            .format(LISTEN_HOST, LISTEN_PORT))
                except socket.timeout:
                    pass
        return metrics
    finally:
        udpsock.close()
import lzma
import sys

import bson  # from PyMongo

# Stream commit documents (BSON) from stdin into three parallel outputs:
# raw 20-byte SHAs, NUL-separated repo slugs, NUL-separated messages.
with open("commits.bin", "wb") as commits, \
        lzma.open("repos.txt.xz", "wb") as repos, \
        lzma.open("messages.txt.xz", "wb") as messages:
    opts = bson.CodecOptions(unicode_decode_error_handler="ignore")
    for obj in bson.decode_file_iter(sys.stdin.buffer, codec_options=opts):
        commits.write(bytes.fromhex(obj["sha"]))
        # [29:-53] strips the "https://api.github.com/repos/" prefix and the
        # "/git/commits/<40-hex-sha>" suffix — presumably leaving
        # "owner/repo"; verify against the dump's url format.
        repos.write(obj["commit"]["url"][29:-53].encode())
        repos.write(b"\0")
        messages.write(obj["commit"]["message"].encode(errors="ignore"))
        messages.write(b"\0")
import lzma
import sys

import bson  # from PyMongo

# Stream commit documents (BSON) from stdin into three parallel outputs:
# raw 20-byte SHAs, NUL-separated repo slugs, NUL-separated messages.
with open("commits.bin", "wb") as commits:
    with lzma.open("repos.txt.xz", "wb") as repos:
        with lzma.open("messages.txt.xz", "wb") as messages:
            # Outer retry loop: a corrupt document raises InvalidBSON, in
            # which case a fresh decode_file_iter is started on the same
            # stream (picking up wherever stdin's file position is).
            while True:
                try:
                    for obj in bson.decode_file_iter(
                            sys.stdin.buffer,
                            codec_options=bson.CodecOptions(
                                unicode_decode_error_handler="ignore")):
                        # Skip documents missing any required field.
                        try:
                            obj["sha"]
                            obj["commit"]["url"]
                            obj["commit"]["message"]
                        except KeyError:
                            continue
                        commits.write(bytes.fromhex(obj["sha"]))
                        # NOTE(review): [29:-53] presumably strips the
                        # "https://api.github.com/repos/" prefix and the
                        # "/git/commits/<sha>" suffix — confirm.
                        repos.write(obj["commit"]["url"][29:-53].encode())
                        repos.write(b"\0")
                        messages.write(
                            obj["commit"]["message"].encode(errors="ignore"))
                        messages.write(b"\0")
                    # Stream exhausted cleanly: leave the retry loop.
                    break
                except bson.errors.InvalidBSON:
                    continue
def iter_elements(data, recurse=True):
    """Yield ``(dotted_path, value)`` pairs for the elements of *data*.

    Delegates to ``_iter_elements`` with raw-BSON codec options; *recurse*
    is passed straight through.
    """
    raw_opts = bson.CodecOptions(document_class=bson.raw_bson.RawBSONDocument)
    for key_path, value in _iter_elements(data, raw_opts, recurse):
        yield '.'.join(key_path), value
def getTZCodecOptions():
    """Codec options decoding datetimes as aware values in the app's TZ."""
    zone = pytz.timezone(app.config.get('TZ'))
    return bson.CodecOptions(tz_aware=True, tzinfo=zone)
def get_events_and_response(url, num_expected=1000, timeout=5, strict=True,
                            inits_expected=True, jmx_expected=False,
                            req_method="GET", req_data=None, req_type='html',
                            headers=None):
    """ Returns the events, init events, and the response object

    init events are in a list
    response object will have a 'code' property for when it is 200 and
    also for "exotic http errors" read
    https://docs.python.org/2/library/urllib2.html for more info
    however, if the domain is not available at all (500 code), then the
    response will be the Exception object which will not have a code property

    Parameters:
    url -- The URL to call. Can be a str, urllib2.Request object, or
        callable() object.
    num_expected [1000] -- The number of events to expect. If more are
        found, an error is thrown.
    timeout [3] -- The socket listen timeout in seconds
    strict [True] -- only valid with num_expected; ensure that
        num == num_expected.
    inits_expected [True] -- If True, __Init event pairs will be filtered
        into the returned `inits` array. The number of remaining events are
        checked against num_expected. Set to False to skip this processing.
    """
    # A callable url is invoked directly; otherwise wrap _make_call.
    func = url if callable(url) else functools.partial(
        _make_call, url, req_method, req_data, req_type, headers)
    with UdpCollector(LISTEN_HOST, LISTEN_PORT, timeout) as uc:
        start_time = time.time()
        full_data = []
        # The request failure itself is part of the return contract: the
        # exception object is handed back as `response`.
        try:
            response = func()
        except Exception as ex:
            response = ex
        # Drain raw UDP packets until timeout, cap, or socket timeout.
        while len(full_data) < MAX_EVENTS:
            try:
                ret = uc.recv(BUFFER_SIZE)
                full_data.append(ret)
            except socket.timeout:
                break
            else:
                if time.time() - start_time > timeout:
                    break
    events = []
    for data in full_data:
        if data:
            bson_data = bson.BSON(data)
            try:
                # to_dict() is used for very old pymongo library
                # it has been replaced by decode() now.
                evts = bson_data.to_dict(mdict)
            except AttributeError:
                evts = bson_data.decode(codec_options=bson.CodecOptions(
                    document_class=mdict))
            events.append(evts)
    inits = []
    jmxs = []
    measurements = []
    # Only prints on failure when run through nose
    print('Events collected: {0}'.format(len(events)))
    _debug_print(events)
    # The _filter_* helpers move matching events out of `events` into the
    # given list, so the count checks below see only the remainder.
    _filter_measurements(events, measurements)
    print('Measurements after filtering:')
    _debug_print(measurements)
    if inits_expected:
        _filter_inits(events, inits)
        print('Inits after filtering:')
        _debug_print(inits)
    if jmx_expected:
        _filter_jmxs(events, jmxs)
        print('JMX events after filtering:')
        _debug_print(jmxs)
    print('Events after filtering:')
    _debug_print(events)
    # num_expected == -1 disables the count checks entirely.
    if num_expected != -1:
        if len(events) > num_expected:
            raise ExtraEventsError(num_expected, len(events))
        if strict and len(events) != num_expected:
            raise MissingEventsError(num_expected, len(events))
    return events, inits, response
def get_events_via_udp(udp, url, num_expected=1000, timeout=3, strict=True,
                       inits_expected=True, jmx_expected=False,
                       req_method="GET", req_data=None, req_type='html',
                       repeat_times=1):
    """Collect agent events over a (possibly caller-supplied) UDP socket.

    Like get_events_and_response() but takes an existing bound socket via
    *udp* (pass None to create and bind one here) and can repeat the
    request *repeat_times* times. Returns ``(events, inits, response)``.

    if req_method="GET", req_data will be ignored
    if req_method="POST", req_data could be a dictionary of values or an
    encoded string of values to be sent with the post request
    if req_method="PUT", req_data is a string that will be sent to the body

    :params url:string request url
    """
    # (Removed: a dead local ``use_udp = not udp is None`` that was never
    # read anywhere in this function.)
    if udp is None:
        udp = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        udp.settimeout(timeout)
        udp.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        udp.bind((
            host,
            port,
        ))
    # NOTE(review): ``headers`` (like ``host``/``port``) is not a parameter
    # or local here — it must resolve at module level or this raises
    # NameError. Confirm it exists module-wide before relying on it.
    func = functools.partial(_make_call, url, req_method, req_data, req_type,
                             headers)
    start_time = time.time()
    full_data = []
    try:
        # Only the last response is kept when repeating.
        for _ in range(repeat_times):
            response = func()
    except Exception as ex:
        response = ex
    # Drain raw UDP packets until timeout, cap, or socket timeout.
    while len(full_data) < MAX_EVENTS:
        try:
            ret = udp.recv(BUFFER_SIZE)
            full_data.append(ret)
        except socket.timeout:
            break
        else:
            if time.time() - start_time > timeout:
                break
    events = []
    for data in full_data:
        if data:
            bson_data = bson.BSON(data)
            try:
                # to_dict() is used for very old pymongo library
                # it has been replaced by decode() now.
                evts = bson_data.to_dict(mdict)
            except AttributeError:
                evts = bson_data.decode(codec_options=bson.CodecOptions(
                    document_class=mdict))
            events.append(evts)
    inits = []
    jmxs = []
    measurements = []
    # Only prints on failure when run through nose
    print('Events collected: {0}'.format(len(events)))
    _debug_print(events)
    _filter_measurements(events, measurements)
    print('Measurements after filtering:')
    _debug_print(measurements)
    if inits_expected:
        _filter_inits(events, inits)
        print('Inits after filtering:')
        _debug_print(inits)
    if jmx_expected:
        _filter_jmxs(events, jmxs)
        print('JMX events after filtering:')
        _debug_print(jmxs)
    print('Events after filtering:')
    _debug_print(events)
    if num_expected != -1:
        if len(events) > num_expected:
            raise ExtraEventsError(num_expected, len(events))
        if strict and len(events) != num_expected:
            raise MissingEventsError(num_expected, len(events))
    return events, inits, response
def get_collection(self, collection):
    """Return *collection* configured to ignore unicode decode errors.

    Raises:
        AuthErr: if the client has not logged in yet.
    """
    if not self.is_login:
        raise AuthErr
    lenient = bson.CodecOptions(unicode_decode_error_handler="ignore")
    return self.__db[collection].with_options(codec_options=lenient)