Пример #1
0
def large_social_networks_twitter():
    """Build an undirected Twitter follow graph from BSON dumps and save it.

    Steps, in order:

    1. Every ``user_name`` found in the followees dump becomes a node.
    2. For each record of the followees and followers dumps, an edge is
       added between the record's ``user_name`` and every listed
       ``screen_name`` that is already a node.
    3. The graph is relabelled with integer ids (the original label is kept
       in the ``old_label`` node attribute), its sparsity and summary are
       printed, and it is written to ``twitter.nothred.gpickle``.
    4. Nodes whose degree is below ``degree_thrd`` are removed, the rest is
       relabelled again, a self-loop is added to every node and the result
       is written to ``twitter.gpickle``, ``twitter_adj`` and
       ``twitter_edgelist``.

    Input paths and output file names are hard-coded; nothing is returned.

    NOTE(review): ``nx.info``, ``nx.write_gpickle`` and
    ``nx.from_scipy_sparse_matrix`` no longer exist in NetworkX 3.x, so this
    function needs a 2.x release as written.
    """
    degree_thrd = 3
    twitter_dir = '/mnt/wzhan139/cross media data/Twitter/'
    followees_path = twitter_dir + 'twitter_followees.bson'
    followers_path = twitter_dir + 'twitter_followers.bson'
    codec_opts = bson.CodecOptions(unicode_decode_error_handler="ignore")

    G = nx.Graph()

    # Pass 1: register the users of the followees dump first, so the edge
    # passes below can skip accounts that are not part of the data set.
    with open(followees_path, "rb") as f:
        for c, d in enumerate(bson.decode_file_iter(f, codec_opts)):
            print("Reading node " + str(c))
            G.add_node(d['user_name'])

    # Pass 2: add edges from both dumps. Only the listed account has to be a
    # known node; add_edge() creates the record's own user if it is missing.
    for path, relation in ((followees_path, 'followees'),
                           (followers_path, 'followers')):
        with open(path, "rb") as f:
            for c, d in enumerate(bson.decode_file_iter(f, codec_opts)):
                print("Constructing graph in node " + str(c))
                for other in d[relation]:
                    if G.has_node(other['screen_name']):
                        G.add_edge(d['user_name'], other['screen_name'])

    G2 = nx.convert_node_labels_to_integers(G, label_attribute='old_label')
    # The node count is all that is needed here; building a sparse adjacency
    # matrix just to read its shape is wasted work on a large graph.
    num_node = G2.number_of_nodes()
    sparsity = G2.number_of_edges() / num_node ** 2
    print("no thredshold graph sparsity is " + str(sparsity))
    print(nx.info(G2))
    nx.write_gpickle(G2, "twitter.nothred.gpickle")

    # Collect first, remove afterwards: the graph must not change while its
    # nodes are being iterated.
    low_degree = [n for n in G2.nodes() if G2.degree(n) < degree_thrd]
    G2.remove_nodes_from(low_degree)
    G2 = nx.convert_node_labels_to_integers(G2)

    num_node = G2.number_of_nodes()
    # An identity matrix converted to a graph is one self-loop per node;
    # composing it with G2 adds those self-loops to the filtered graph.
    identity = sp.dia_matrix((np.ones(num_node), 0),
                             shape=(num_node, num_node))
    G3 = nx.from_scipy_sparse_matrix(identity)
    G4 = nx.compose(G2, G3)
    nx.write_gpickle(G4, "twitter.gpickle")
    nx.write_adjlist(G4, "twitter_adj")
    nx.write_edgelist(G4, "twitter_edgelist")

    sparsity = G4.number_of_edges() / num_node ** 2
    print("sparsity is " + str(sparsity))
    print(nx.info(G4))
Пример #2
0
    def get(self,
            mydb,
            mycol,
            who,
            what=None,
            sort=None,
            son=False,
            limit=None):
        """Run ``db.mycol.find(who, what)`` and return the matches as a list.

        Parameters:
        mydb -- database identifier handed to ``self.connect``
        mycol -- name of the collection looked up on ``self.dbase``
        who -- first argument of ``find`` (the query filter)
        what -- optional second argument of ``find``; omitted when falsy
        sort -- optional iterable of ``(key, direction)`` pairs; a direction
                of 1 sorts ascending, any other value descending
        son -- when true, documents are decoded as ``bson.SON``
        limit -- optional maximum number of documents; ignored when falsy

        Returns the list of documents, or None when an error occurred (the
        error is logged through ``l.log_exception``).
        """
        try:
            self.connect(mydb)
            col = self.dbase[mycol]
            if son:
                opts = bson.CodecOptions(document_class=bson.SON)
                col = col.with_options(codec_options=opts)

            cursor = col.find(who, what) if what else col.find(who)
            if sort:
                cursor = cursor.sort([
                    (tpl[0],
                     pymongo.ASCENDING if tpl[1] == 1 else pymongo.DESCENDING)
                    for tpl in sort
                ])
            if limit:
                cursor = cursor.limit(limit)

            return list(cursor)
        except Exception:
            # Best-effort by design: log and report failure as None.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being swallowed.
            l.log_exception('MyMongo.get')
            return None
Пример #3
0
def _codec_options(**options):
    """
    Build the codec options that bson.BSON.{decode{,_all},encode} accept.

    The keyword arguments are first narrowed down with
    ``anyconfig.utils.filter_options(_CO_OPTIONS, options)``; only the result
    of that call is handed to :class:`bson.CodecOptions`.

    :param options: Keyword options, possibly including unrelated ones
    :return: :class:`~bson.CodecOptions`
    """
    return bson.CodecOptions(
        **anyconfig.utils.filter_options(_CO_OPTIONS, options))
Пример #4
0
def iter_records(curs, *names):
    """Yield one tuple per document in ``curs``, ordered like ``names``.

    Each name is split on ``.`` and the resulting paths are collected in a
    ``PathTrie``. For every document, ``get_fields`` is run over the
    document's ``raw`` attribute; each ``(path, value)`` pair it produces is
    stored under the dot-joined path. Names for which nothing was produced
    are reported as ``None``.
    """
    raw_opts = bson.CodecOptions(document_class=bson.raw_bson.RawBSONDocument)
    path_trie = PathTrie([name.split('.') for name in names])
    blank = dict.fromkeys(names)
    for document in curs:
        record = blank.copy()
        record.update(
            ('.'.join(path), value)
            for path, value in get_fields(document.raw, raw_opts, path_trie))
        yield tuple(record[name] for name in names)
Пример #5
0
def bson_as_json(value, debugger, verbose=False, oneline=False, raw=False):
    """Render the bson structure behind an lldb value as JSON text.

    Parameters:
    value -- lldb value (or pointer to one) with ``len`` and ``flags`` members
    debugger -- lldb debugger used to look up types and read memory
    verbose -- prefix the output with the length and the decoded flags
    oneline -- emit compact JSON instead of indenting by 2
    raw -- return ``repr()`` of the raw bytes instead of JSON

    Returns a string in every case: the JSON (or raw repr), the literal
    ``'uninitialized'`` when the length/flags look implausible, a hint when
    the ``bson_impl_inline_t`` type has no display name, or the message of
    any exception raised along the way.
    """
    try:
        target = debugger.GetSelectedTarget()
        inline_t = target.FindFirstType('bson_impl_inline_t')
        alloc_t = target.FindFirstType('bson_impl_alloc_t')

        if not inline_t.GetDisplayTypeName():
            return """error: libbson not compiled with debug symbols
Download latest mongo-c-driver.tar.gz from mongoc.org and do:
./configure --enable-debug
make
sudo make install
"""

        if value.TypeIsPointerType():
            value = value.Dereference()

        length = value.GetChildMemberWithName('len').GetValueAsUnsigned()
        flags = value.GetChildMemberWithName('flags').GetValueAsUnsigned()

        # Unknown flag bits or a length outside 5 bytes .. 16 MiB are treated
        # as a sign that the memory does not hold a valid structure.
        if flags & ~ALL_FLAGS or not 5 <= length <= 16 * 1024 * 1024:
            return 'uninitialized'

        if flags & FLAGS['INLINE']:
            # Inline storage: anything longer than 120 bytes is rejected.
            if length > 120:
                return 'uninitialized'

            data = value.Cast(inline_t).GetChildMemberWithName('data')
            raw_bson = get_inline_bytes(data)
        else:
            alloc = value.Cast(alloc_t)
            offset_member = alloc.GetChildMemberWithName('offset')
            offset = offset_member.GetValueAsUnsigned()
            buf = alloc.GetChildMemberWithName('buf').Dereference()
            raw_bson = get_allocated_bytes(buf, offset, debugger)

        if raw:
            return repr(raw_bson)

        header = ''
        if verbose:
            header += 'len=%s\n' % length
            header += flags_str(flags) + '\n'

        codec_options = bson.CodecOptions(document_class=DuplicateKeyDict)
        document = bson.BSON(raw_bson).decode(codec_options)
        return header + json_util.dumps(document,
                                        indent=None if oneline else 2)
    except Exception as exc:
        return str(exc)
Пример #6
0
 def __init__(self, db):
     """Keep a handle on ``db`` and prepare the collection attributes.

     A plain ``dict`` passed as ``db`` gets empty ``RecursiveDict``
     placeholders for ``materials`` and ``compositions``; ``contributions``
     is not set in that case. Any other ``db`` must expose
     ``contributions``, ``materials`` and ``compositions``, each of which is
     re-bound with ``bson.SON`` as the document class.
     """
     self.db = db
     if isinstance(self.db, dict):
         self.materials = RecursiveDict()
         self.compositions = RecursiveDict()
         return

     son_opts = bson.CodecOptions(document_class=bson.SON)
     for name in ('contributions', 'materials', 'compositions'):
         collection = getattr(self.db, name)
         setattr(self, name, collection.with_options(codec_options=son_opts))
Пример #7
0
def bson_dumps(raw_bson, oneline):
    """Decode raw BSON bytes and return them as a JSON string.

    Documents are decoded into ``DuplicateKeyDict``. The JSON is indented by
    2 unless ``oneline`` is true. When the ``bson`` name is falsy, an
    installation hint is returned instead.
    """
    if not bson:
        return "No PyMongo, do `python -m pip install pymongo`"

    codec_options = bson.CodecOptions(document_class=DuplicateKeyDict)
    decoded = bson.BSON(raw_bson).decode(codec_options)
    return json_util.dumps(decoded, indent=None if oneline else 2)
Пример #8
0
 def __init__(self, db):
     """Store ``db`` and set up plotting plus the collection attributes.

     With a plain ``dict`` only empty ``RecursiveDict`` placeholders for
     ``materials`` and ``compositions`` are created. Otherwise plotly and
     cufflinks are imported, cufflinks is configured (world readable,
     'pearl' theme), and ``contributions``, ``materials`` and
     ``compositions`` are taken from ``db`` with ``bson.SON`` as the
     document class.
     """
     self.db = db
     if isinstance(self.db, dict):
         self.materials = RecursiveDict()
         self.compositions = RecursiveDict()
         return

     # `py` is not used further in this method; the import is kept as is.
     import plotly.plotly as py
     import cufflinks
     cufflinks.set_config_file(world_readable=True, theme='pearl')

     son_opts = bson.CodecOptions(document_class=bson.SON)
     database = self.db
     self.contributions = database.contributions.with_options(
         codec_options=son_opts)
     self.materials = database.materials.with_options(
         codec_options=son_opts)
     self.compositions = database.compositions.with_options(
         codec_options=son_opts)
Пример #9
0
 def __init__(self, db=None):
     """Store ``db``, set up an optional Faker and the collections.

     ``self.fake`` is a ``faker.Faker`` instance when the package can be
     imported and instantiated, otherwise ``None``. When ``db`` is given,
     ``contributions``, ``materials`` and ``compositions`` are taken from it
     with ``bson.SON`` as the document class; with ``db=None`` those
     attributes are not set.
     """
     self.db = db
     try:
         from faker import Faker
         self.fake = Faker()
     except Exception:
         # Faker is optional, so any import or construction failure simply
         # disables it. ``Exception`` instead of a bare except keeps
         # KeyboardInterrupt and SystemExit from being swallowed here.
         self.fake = None
     if self.db is not None:
         opts = bson.CodecOptions(document_class=bson.SON)
         self.contributions = self.db.contributions.with_options(
             codec_options=opts)
         self.materials = self.db.materials.with_options(codec_options=opts)
         self.compositions = self.db.compositions.with_options(
             codec_options=opts)
Пример #10
0
 def __init__(self, db):
     """Store ``db``, the notebook directory and a notebook executor.

     ``nbdir`` is the directory containing this source file. ``ep`` is an
     ``ExecutePreprocessor`` with a 600 second timeout, the 'python2' kernel
     and errors disallowed. A plain ``dict`` as ``db`` yields empty
     ``RecursiveDict`` placeholders for ``materials`` and ``compositions``;
     any other ``db`` supplies ``contributions``, ``materials`` and
     ``compositions`` with ``bson.SON`` as the document class.
     """
     self.db = db
     here = os.path.abspath(__file__)
     self.nbdir = os.path.dirname(here)
     self.ep = ExecutePreprocessor(
         timeout=600, kernel_name='python2', allow_errors=False)

     if isinstance(self.db, dict):
         self.materials = RecursiveDict()
         self.compositions = RecursiveDict()
         return

     son_opts = bson.CodecOptions(document_class=bson.SON)
     for attr in ('contributions', 'materials', 'compositions'):
         source = getattr(self.db, attr)
         setattr(self, attr, source.with_options(codec_options=son_opts))
Пример #11
0
    def _check_colls_eq(primary_db, secondary_db, coll_name, sb):
        """
        Appends information to 'sb' about the differences, if any, between
        the 'coll_name' collection on the primary and the 'coll_name'
        collection on the secondary.

        Both collections are opened with bson.SON as the document class
        before their documents are extracted and compared.
        """

        son_options = bson.CodecOptions(document_class=bson.SON)

        # Open both collections before reading any documents.
        primary_coll, secondary_coll = [
            db.get_collection(coll_name, codec_options=son_options)
            for db in (primary_db, secondary_db)
        ]

        primary_docs = CheckReplDBHash._extract_documents(primary_coll)
        secondary_docs = CheckReplDBHash._extract_documents(secondary_coll)

        CheckReplDBHash._get_collection_diff(primary_docs, secondary_docs, sb)
Пример #12
0
def bson_iter_type_summary(value, internal_dict):
    """Format a bson_iter_t as a chunk of JSON and a ^ marking the position.

    Parameters:
    value -- lldb value (or pointer to one) with ``len``, ``raw`` and ``key``
             members
    internal_dict -- not used by this function

    Returns a three-line string: a window of the document's JSON about one
    terminal width wide, followed by a line with ``^`` under the iterator
    position. If anything raises, the exception message is returned instead.
    """
    try:
        if value.TypeIsPointerType():
            value = value.Dereference()

        length = value.GetChildMemberWithName('len').GetValueAsUnsigned()
        data = value.GetChildMemberWithName('raw')
        raw_bson = get_allocated_bytes(data, 0, lldb.debugger)
        key_offset = value.GetChildMemberWithName('key').unsigned

        # JSONify the BSON document.
        codec_options = bson.CodecOptions(document_class=DuplicateKeyDict)
        as_dict = bson.BSON(raw_bson).decode(codec_options)
        obj = json_util._json_convert(as_dict, json_util.DEFAULT_JSON_OPTIONS)
        as_json = json_util.dumps(as_dict)

        pos = 0
        if key_offset:
            # Find the iter's position.
            # HACK, fails if there are dupe keys, or a value same as this key.
            key = '"%s"' % get_cstring(data, key_offset, length, lldb.debugger)
            for chunk in json.JSONEncoder().iterencode(obj):
                if chunk == key:
                    break

                pos += len(chunk)

        width = lldb.debugger.GetTerminalWidth() - 1
        # Floor division keeps the offsets integral: with "/" they become
        # floats on Python 3, and both the slice and the string repetition
        # below would raise TypeError.
        half_width = width // 2
        # Where in the JSON string to start printing.
        start = max(0, pos - half_width)
        end = min(len(as_json), pos + half_width)

        ret = '\n'
        ret += as_json[start:end]
        ret += '\n'
        ret += ' ' * (pos - start) + '^'
        ret += '\n'
        return ret
    except Exception as exc:
        return str(exc)
Пример #13
0
 def getOne(self, mydb, mycol, who, what=None, sort=None, son=False):
     """Run ``db.mycol.find_one(who, what)`` and return the document.

     Parameters:
     mydb -- database identifier handed to ``self.connect``
     mycol -- name of the collection looked up on ``self.dbase``
     who -- first argument of ``find_one`` (the query filter)
     what -- optional second argument of ``find_one``; omitted when falsy
     sort -- optional iterable of ``(key, direction)`` pairs; a direction of
             1 sorts ascending, any other value descending
     son -- when true, the document is decoded as ``bson.SON``

     Returns whatever ``find_one`` returns, or None when an error occurred
     (the error is logged through ``l.log_exception``).
     """
     try:
         self.connect(mydb)
         col = self.dbase[mycol]
         if son:
             opts = bson.CodecOptions(document_class=bson.SON)
             col = col.with_options(codec_options=opts)

         # An empty list is passed when no sort order was requested.
         srt = [
             (tpl[0],
              pymongo.ASCENDING if tpl[1] == 1 else pymongo.DESCENDING)
             for tpl in (sort or [])
         ]
         if what:
             return col.find_one(who, what, sort=srt)
         return col.find_one(who, sort=srt)
     except Exception:
         # Best-effort by design: log and report failure as None.
         # ``Exception`` (not a bare except) lets KeyboardInterrupt and
         # SystemExit propagate instead of being swallowed.
         l.log_exception('MyMongo.getOne')
         return None
Пример #14
0
def convert(source_path, target_path):
    """Convert a BSON dump into a gzip-compressed Parquet file.

    Each decoded row is flattened before export:

    * a ``dict`` value is replaced by its ``"id"`` entry, or dropped when it
      has none;
    * a ``str`` value longer than 35 characters is dropped.

    Rows are collected into pandas DataFrames in batches, concatenated, and
    every column is cast to ``str`` before being handed to the pm4pymdl
    Parquet exporter. Nothing is written when the dump contains no rows.

    Parameters:
    source_path -- path of the BSON file to read
    target_path -- destination passed to the Parquet exporter
    """
    from pm4pymdl.util.parquet_exporter import exporter as parquet_exporter
    limit_length = 35
    codec_options = bson.CodecOptions(unicode_decode_error_handler='ignore')
    json_list = []
    df_list = []
    # The context manager guarantees the source file is closed, including
    # when decoding fails part-way through.
    with open(source_path, 'rb') as source:
        rows = bson.decode_file_iter(source, codec_options=codec_options)
        for i, row in enumerate(rows):
            # Iterate over a snapshot of the keys: entries are deleted from
            # the row inside the loop.
            for col in list(row.keys()):
                value = row[col]
                if type(value) is dict:
                    if "id" in value:
                        row[col] = value["id"]
                    else:
                        del row[col]
                elif type(value) is str and len(value) > limit_length:
                    del row[col]
            json_list.append(row)
            # Flush the buffered rows into a DataFrame periodically so the
            # list of raw dicts does not grow without bound.
            if i > 0 and i % 10000 == 0:
                print(i)
                df_list.append(pd.DataFrame(json_list))
                json_list = []
    if json_list:
        df_list.append(pd.DataFrame(json_list))
    if df_list:
        overall_df = pd.concat(df_list)
        for col in overall_df.columns:
            overall_df[col] = overall_df[col].astype(str)
        parquet_exporter.apply(overall_df,
                               target_path,
                               parameters={"compression": "gzip"})
Пример #15
0
def get_metrics(urls,
                req_method="GET",
                timeout=45,
                req_type='html',
                req_data=None):
    """ Sends http requests based on the provided urls and collect the corresponding metrics messages

    Keyword arguments:
    urls -- The URLS to call.
    req_method -- The Http method for the call
    timeout -- timeout used for waiting for metrics response, should be set slightly greater than the metrics reporting interval
    req_type -- forwarded unchanged to _make_call
    req_data -- forwarded unchanged to _make_call

    Returns the list of decoded messages that contain a matching
    measurement, or None when one of the requests raised. The UDP socket is
    always closed before returning.
    """
    print('Collecting metrics from udp socket {0}:{1}'.format(
        LISTEN_HOST, LISTEN_PORT))
    udpsock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    # try/finally: the socket used to stay open (and bound) on the early
    # "return None" below and on any unexpected exception.
    try:
        udpsock.settimeout(1)
        udpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        udpsock.bind((
            LISTEN_HOST,
            LISTEN_PORT,
        ))

        start_time = time.time()
        try:
            for url in urls:
                _make_call(url, req_method, req_data, req_type)
        except Exception:
            print('error on {0}'.format(urls))
            return None

        metrics = []
        keep_receiving = True
        # Never set to True in this function, so a match never stops the
        # loop immediately; the 5 second timer below does.
        first_ping = False
        # Time of the first matching message; 0 means "none seen yet".
        timer_5s = 0

        while keep_receiving:
            if time.time() - start_time > timeout:
                print('Failed to collect metrics after waiting for {0}'.format(
                    timeout))
                keep_receiving = False
            elif timer_5s > 0 and (time.time() - timer_5s > 5):
                # Stop 5 seconds after the first matching message.
                keep_receiving = False
            try:
                ret = udpsock.recv(BUFFER_SIZE)
                if ret:
                    bson_data = bson.BSON(ret)
                    try:
                        data = bson_data.to_dict(mdict)
                    except AttributeError:
                        # No to_dict() on this bson version; use decode().
                        data = bson_data.decode(
                            codec_options=bson.CodecOptions(
                                document_class=mdict))

                    if "measurements" in data:  #then it's a metrics message
                        for m in data["measurements"]:
                            if "tags" in m or first_ping or "IsCustom" in m:
                                metrics.append(data)
                                data = None
                                if first_ping:
                                    keep_receiving = False
                                elif timer_5s == 0:
                                    timer_5s = time.time()
                                break
                else:
                    print(
                        'Failed to collect metrics from udp socket {0}:{1} gave empty response'
                        .format(LISTEN_HOST, LISTEN_PORT))
            except socket.timeout:
                pass

        return metrics
    finally:
        udpsock.close()
Пример #16
0
import lzma
import sys

import bson  # from PyMongo


# Read BSON commit documents from stdin and split them into three outputs:
# raw SHA bytes, NUL-terminated repository URL fragments and NUL-terminated
# commit messages (the latter two xz-compressed).
with open("commits.bin", "wb") as commits, \
        lzma.open("repos.txt.xz", "wb") as repos, \
        lzma.open("messages.txt.xz", "wb") as messages:
    lenient_options = bson.CodecOptions(unicode_decode_error_handler="ignore")
    documents = bson.decode_file_iter(sys.stdin.buffer,
                                      codec_options=lenient_options)
    for obj in documents:
        commits.write(bytes.fromhex(obj["sha"]))
        # Keep only the URL between its first 29 and last 53 characters.
        repos.write(obj["commit"]["url"][29:-53].encode())
        repos.write(b"\0")
        messages.write(obj["commit"]["message"].encode(errors="ignore"))
        messages.write(b"\0")
Пример #17
0
import lzma
import sys

import bson  # from PyMongo

# Read BSON commit documents from stdin and split them into three outputs:
# raw SHA bytes, NUL-terminated repository URL fragments and NUL-terminated
# commit messages (the latter two xz-compressed). Documents lacking one of
# the required keys are skipped; after an InvalidBSON error decoding is
# restarted on the same stdin stream.
with open("commits.bin", "wb") as commits, \
        lzma.open("repos.txt.xz", "wb") as repos, \
        lzma.open("messages.txt.xz", "wb") as messages:
    finished = False
    while not finished:
        try:
            documents = bson.decode_file_iter(
                sys.stdin.buffer,
                codec_options=bson.CodecOptions(
                    unicode_decode_error_handler="ignore"))
            for obj in documents:
                try:
                    sha = obj["sha"]
                    commit = obj["commit"]
                    url = commit["url"]
                    message = commit["message"]
                except KeyError:
                    continue
                commits.write(bytes.fromhex(sha))
                # Keep only the URL between its first 29 and last 53
                # characters.
                repos.write(url[29:-53].encode())
                repos.write(b"\0")
                messages.write(message.encode(errors="ignore"))
                messages.write(b"\0")
            finished = True
        except bson.errors.InvalidBSON:
            pass
Пример #18
0
def iter_elements(data, recurse=True):
    """Yield ``(dotted_key, value)`` pairs for the elements of ``data``.

    The work is delegated to ``_iter_elements``, which is given codec options
    selecting ``RawBSONDocument`` as the document class; each key path it
    produces is joined with ``.``.
    """
    raw_opts = bson.CodecOptions(document_class=bson.raw_bson.RawBSONDocument)
    for key_path, value in _iter_elements(data, raw_opts, recurse):
        dotted_key = '.'.join(key_path)
        yield dotted_key, value
def getTZCodecOptions():
    """Return timezone-aware codec options using the app's ``TZ`` setting."""
    zone_name = app.config.get('TZ')
    return bson.CodecOptions(tz_aware=True, tzinfo=pytz.timezone(zone_name))
Пример #20
0
def get_events_and_response(url,
                            num_expected=1000,
                            timeout=5,
                            strict=True,
                            inits_expected=True,
                            jmx_expected=False,
                            req_method="GET",
                            req_data=None,
                            req_type='html',
                            headers=None):
    """ Makes a call, collects the UDP messages it triggers and returns
        ``(events, inits, response)``.

        ``response`` is whatever the call returned, or the Exception object
        itself when the call raised (in which case it may not have a
        ``code`` property).
    Parameters:
    url -- The URL to call, or a callable() that is invoked without arguments.
    num_expected [1000] -- The number of events to expect; -1 disables the check.
        If more are found, ExtraEventsError is raised.
    timeout [5] -- Passed to the UdpCollector; also bounds the total receive time in seconds.
    strict [True] -- only valid with num_expected; ensure that num == num_expected,
        otherwise MissingEventsError is raised.
    inits_expected [True] -- If True, events are run through _filter_inits with the
        returned `inits` list before they are counted against num_expected.
    jmx_expected [False] -- If True, events are run through _filter_jmxs as well.
    req_method, req_data, req_type, headers -- forwarded to _make_call when `url`
        is not callable.
    """
    if callable(url):
        func = url
    else:
        func = functools.partial(_make_call, url, req_method, req_data,
                                 req_type, headers)

    with UdpCollector(LISTEN_HOST, LISTEN_PORT, timeout) as uc:
        start_time = time.time()
        full_data = []
        try:
            response = func()
        except Exception as error:
            response = error

        # Receive until the cap is hit, the socket times out, or the overall
        # time budget is spent.
        while len(full_data) < MAX_EVENTS:
            try:
                full_data.append(uc.recv(BUFFER_SIZE))
            except socket.timeout:
                break
            if time.time() - start_time > timeout:
                break

    events = []
    for payload in full_data:
        if not payload:
            continue
        bson_data = bson.BSON(payload)
        try:
            # Very old pymongo releases expose to_dict(); newer ones only
            # have decode().
            decoded = bson_data.to_dict(mdict)
        except AttributeError:
            decoded = bson_data.decode(codec_options=bson.CodecOptions(
                document_class=mdict))
        events.append(decoded)

    inits, jmxs, measurements = [], [], []

    # Only prints on failure when run through nose
    print('Events collected: {0}'.format(len(events)))
    _debug_print(events)

    _filter_measurements(events, measurements)
    print('Measurements after filtering:')
    _debug_print(measurements)

    if inits_expected:
        _filter_inits(events, inits)
        print('Inits after filtering:')
        _debug_print(inits)
    if jmx_expected:
        _filter_jmxs(events, jmxs)
        print('JMX events after filtering:')
        _debug_print(jmxs)
    print('Events after filtering:')
    _debug_print(events)

    if num_expected != -1:
        found = len(events)
        if found > num_expected:
            raise ExtraEventsError(num_expected, found)
        if strict and found != num_expected:
            raise MissingEventsError(num_expected, found)
    return events, inits, response
Пример #21
0
def get_events_via_udp(udp,
                       url,
                       num_expected=1000,
                       timeout=3,
                       strict=True,
                       inits_expected=True,
                       jmx_expected=False,
                       req_method="GET",
                       req_data=None,
                       req_type='html',
                       repeat_times=1):
    """ Makes a call `repeat_times` times, collects the UDP messages it
    triggers and returns ``(events, inits, response)``.

    if req_method="GET", req_data will be ignored
    if req_method="POST", req_data could be a dictionary of values or an encoded string
        of values to be sent with the post request
    if req_method="PUT", req_data is a string that will be sent to the body
    :params udp: socket to receive on; when None a UDP socket is created,
        bound, and closed again before returning
    :params url:string  request url
    :params num_expected: expected number of events, -1 disables the check;
        more events raise ExtraEventsError
    :params timeout: socket timeout for a locally created socket and bound
        on the total receive time, in seconds
    :params strict: raise MissingEventsError unless exactly num_expected
        events remain
    :params inits_expected: run events through _filter_inits
    :params jmx_expected: run events through _filter_jmxs
    :params repeat_times: how many times the call is made; `response` is the
        last result, the raised exception, or None when no call was made
    """
    owns_socket = udp is None
    if owns_socket:
        udp = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        if owns_socket:
            udp.settimeout(timeout)
            udp.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            # NOTE(review): `host` and `port` are neither parameters nor
            # locals; they must exist at module level or this raises
            # NameError -- confirm.
            udp.bind((
                host,
                port,
            ))

        # NOTE(review): `headers` is not a parameter of this function
        # either; it is resolved from the enclosing module -- confirm.
        func = functools.partial(_make_call, url, req_method, req_data,
                                 req_type, headers)

        start_time = time.time()
        full_data = []
        # Pre-set so that repeat_times == 0 does not leave `response`
        # unbound at the final return.
        response = None
        try:
            for _ in range(repeat_times):
                response = func()
        except Exception as ex:
            response = ex

        while len(full_data) < MAX_EVENTS:
            try:
                ret = udp.recv(BUFFER_SIZE)
                full_data.append(ret)
            except socket.timeout:
                break
            else:
                if time.time() - start_time > timeout:
                    break
    finally:
        # Only close what was opened here; a caller-supplied socket stays
        # under the caller's control.
        if owns_socket:
            udp.close()

    events = []
    for data in full_data:
        if data:
            bson_data = bson.BSON(data)
            try:
                # to_dict() is used for very old pymongo library
                # it has been replaced by decode() now.
                evts = bson_data.to_dict(mdict)
            except AttributeError:
                evts = bson_data.decode(codec_options=bson.CodecOptions(
                    document_class=mdict))
            events.append(evts)

    inits = []
    jmxs = []
    measurements = []

    # Only prints on failure when run through nose
    print('Events collected: {0}'.format(len(events)))
    _debug_print(events)

    _filter_measurements(events, measurements)
    print('Measurements after filtering:')
    _debug_print(measurements)

    if inits_expected:
        _filter_inits(events, inits)
        print('Inits after filtering:')
        _debug_print(inits)
    if jmx_expected:
        _filter_jmxs(events, jmxs)
        print('JMX events after filtering:')
        _debug_print(jmxs)
    print('Events after filtering:')
    _debug_print(events)
    if num_expected != -1:
        if len(events) > num_expected:
            raise ExtraEventsError(num_expected, len(events))
        if strict and len(events) != num_expected:
            raise MissingEventsError(num_expected, len(events))
    return events, inits, response
Пример #22
0
 def get_collection(self, collection):
     """Return the named collection, decoding text leniently.

     Raises ``AuthErr`` when ``self.is_login`` is falsy. Otherwise the
     collection is returned with codec options whose
     ``unicode_decode_error_handler`` is "ignore".
     """
     if self.is_login:
         target = self.__db[collection]
         return target.with_options(codec_options=bson.CodecOptions(
             unicode_decode_error_handler="ignore"))
     raise AuthErr