def calc_vector(filename):
    """Analyse a local JavaScript file and return its feature vector as a dict.

    The artefact carries placeholder hashes ("XXX") since only the URL field
    is relevant when analysing a file already on disk.  Raises Exception with
    the analyser's stderr if the analysis fails.
    """
    with open(filename, 'rb') as fp:
        code = fp.read()
    jsr = JavascriptArtefact(url="file:{}".format(filename),
                             sha256="XXX",
                             md5="XXX")
    vector_bytes, failed, stderr = analyse_script(code,
                                                  jsr,
                                                  feature_extractor=args.extractor)
    if failed:
        raise Exception(stderr)
    return json.loads(vector_bytes.decode('utf-8'))
Exemplo n.º 2
0
def find_or_update_analysis_content(db,
                                    m,
                                    fail_iff_not_found=False,
                                    defensive=False,
                                    java=None,
                                    extractor=None,
                                    force=False):
    """Return the cached analysis vector for artefact *m*, recomputing it if absent.

    Looks up db.analysis_content by js_id; on a miss (or when force=True) the
    script bytes are fetched, re-analysed, saved back, and the function
    recurses exactly once with fail_iff_not_found=True to return the
    freshly-saved record.

    db: MongoDB database handle (collections: analysis_content, plus whatever
        get_script()/save_analysis_content() use)
    m: dict with at least 'js_id', 'url', 'sha256', 'md5', 'size_bytes'
    fail_iff_not_found: raise ValueError on a cache miss instead of recomputing
        (used internally to stop the recursion)
    defensive: verify sha256 of the fetched bytes against m['sha256']
    java/extractor: passed to analyse_script() when recomputation is needed
    force: skip the cache lookup entirely and always recompute

    Returns the analysis as a dict (decoded JSON).
    Raises ValueError on a miss with fail_iff_not_found=True, or when the
    analysis itself fails.
    """
    assert isinstance(m, dict)
    assert all([
        'js_id' in m, 'url' in m, 'sha256' in m, 'md5' in m, 'size_bytes' in m
    ])

    js_id = m.get('js_id')
    assert len(js_id) > 0

    # NB: due to an error in processing, I had to throw away the db.analysis_content collection, so records may be missing. Sigh 8th June 2020
    if not force:
        byte_content_doc = db.analysis_content.find_one({'js_id': js_id})
        if fail_iff_not_found and byte_content_doc is None:  # prevent infinite recursion
            raise ValueError("No data for {}".format(js_id))
        elif byte_content_doc is None:
            print("WARNING: unable to locate analysis content for {}".format(
                js_id))
    else:
        byte_content_doc = None

    if byte_content_doc is None:
        # cache miss (or force): fetch the raw script and recompute the vector
        code_bytes, js_id = get_script(db, js_id)
        assert code_bytes is not None
        jsr = JavascriptArtefact(url=m.get('url'),
                                 sha256=m.get('sha256'),
                                 md5=m.get('md5'),
                                 size_bytes=m.get('size_bytes'),
                                 js_id=js_id)
        vector_as_bytes, failed, stderr = analyse_script(
            code_bytes, jsr, java=java, feature_extractor=extractor)
        if failed:
            raise ValueError("Could not analyse artefact: js_id={}\n{}".format(
                js_id, stderr))
        save_analysis_content(db, jsr, vector_as_bytes)

        if defensive:
            # check that artefact hashes match the actual content
            # NOTE(review): this runs after save_analysis_content() — an
            # assertion failure here leaves the freshly-saved record in place
            assert hashlib.sha256(code_bytes).hexdigest() == m.get('sha256')
        return find_or_update_analysis_content(
            db, m, fail_iff_not_found=True)  # this time it should be found!

    assert 'analysis_bytes' in byte_content_doc
    byte_content = byte_content_doc.get('analysis_bytes')
    assert isinstance(byte_content, bytes)
    return json.loads(byte_content.decode())
Exemplo n.º 3
0
def resolve_feature_vector(db, message):
    """Resolve the feature vector for the URL named in *message*.

    Pipeline: url -> db.urls -> db.script_url -> db.scripts, then either a
    cached vector from db.statements_by_count or a fresh analyse_script() run.

    db: MongoDB database handle
    message: dict with 'id' (the URL) and, for the recompute path, 'sha256'
             and 'md5' of the script

    Returns a dict with 'first_url', the script hashes/size, and the
    statements-by-count vector. Raises Exception when any lookup or the
    analysis fails.
    """
    url = message.get('id')
    d = {'first_url': url}
    # 1. find url _id
    ret = db.urls.find_one(
        {'url': url}
    )  # TODO FIXME: do we care which timestamp of the url we get? nah... not for now
    if not ret:
        raise Exception("Could not find URL in MongoDB: {}".format(url))
    url_id = ret.get('_id')  # renamed from `id` to avoid shadowing the builtin

    # 2. lookup url_id in script_urls collection
    ret = db.script_url.find_one({'url_id': url_id})
    if not ret:
        raise Exception("Could not find script_url in MongoDB: {} {}".format(
            url_id, url))
    script_id = ret.get('script')

    # 3. lookup script_id in scripts collection
    ret = db.scripts.find_one({'_id': script_id})
    if not ret:
        raise Exception("Could not find script {} {}".format(script_id, url))
    d.update({
        'sha256': ret.get('sha256'),
        'md5': ret.get('md5'),
        'size_bytes': ret.get('size_bytes')
    })
    code = ret.get('code')

    # 4. finally we want to avoid re-computing the feature vector (slow) so we look it up in Mongo
    ret = db.statements_by_count.find_one({'url': url})
    if ret:
        d.update(**ret)
        d.pop('_id', None)
        d.pop('url', None)
    else:
        sha256 = message.get('sha256')
        md5 = message.get('md5')
        jsr = JavascriptArtefact(url=url, sha256=sha256, md5=md5)
        vector_bytes, failed, stderr = analyse_script(code, jsr)
        if failed:
            raise Exception(stderr)
        # BUGFIX: analyse_script() returns JSON-encoded bytes (every other
        # caller decodes them before use), so indexing the raw result with a
        # string key would raise TypeError. Decode first.
        d.update(**json.loads(vector_bytes.decode())['statements_by_count'])

    return d
Exemplo n.º 4
0
def main(args, consumer=None, producer=None, db=None, cache=None):
    """Consume JS artefact records, analyse each script, and persist/publish results.

    args: parsed CLI namespace (uses .v, .bootstrap, .java, .extractor,
          .cache, .n, .defensive, .to)
    consumer: Kafka consumer iterated via iterate(); closed (without
          autocommit) when the loop finishes
    producer: KafkaProducer; created from args.bootstrap when None
    db: MongoDB database handle used by get_script()/save_analysis_content()
    cache: LRU de-dup cache for incoming artefacts; created when None

    Per-artefact failures are reported via report_failure() and skipped rather
    than aborting the run. Returns 0 on completion.
    Raises ValueError if the java binary or extractor JAR is missing.
    """
    if args.v:
        print(args)
    if producer is None:
        producer = KafkaProducer(value_serializer=json_value_serializer(),
                                 bootstrap_servers=args.bootstrap)
    if cache is None:
        cache = pylru.lrucache(5000)

    # fail fast if the external analysis toolchain is not present
    if not os.path.exists(args.java):
        raise ValueError("Java executable does not exist: {}".format(
            args.java))
    if not os.path.exists(args.extractor):
        raise ValueError(
            "JAR file to extract features does not exist: {}".format(
                args.extractor))
    # we want only artefacts which are not cached and are JS (subject to maximum record limits)
    fv_cache = pylru.lrucache(args.cache) if args.cache > 0 else None
    if fv_cache is None:
        print("WARNING: not using FV cache - are you sure you wanted to?")

    n_cached = n_analysed = n_failed = 0
    is_first = True
    last_sha256 = last_js_id = None
    for jsr in iterate(consumer, args.n, cache, verbose=args.v):
        # eg.  {'url': 'https://XXXX.asn.au/', 'size_bytes': 294, 'inline': True, 'content-type': 'text/html; charset=UTF-8',
        #       'when': '2020-02-06 02:51:46.016314', 'sha256': 'c38bd5db9472fa920517c48dc9ca7c556204af4dee76951c79fec645f5a9283a',
        #        'md5': '4714b9a46307758a7272ecc666bc88a7', 'origin': 'XXXX' }  NB: origin may be none for old records (sadly)
        assert isinstance(jsr, JavascriptArtefact)

        # same content hash but a different js_id is suspicious — warn but continue
        # (idiom fix: was `not last_js_id == jsr.js_id`)
        if last_sha256 == jsr.sha256 and last_js_id != jsr.js_id:
            print("WARNING: shifting js_id despite hash match: {} {} != {}".
                  format(jsr.sha256, jsr.js_id, last_js_id))

        # got results cache hit ??? Saves computing it again and hitting the DB, which is slow...
        if fv_cache is not None and jsr.js_id in fv_cache:
            byte_content, js_id = fv_cache[jsr.js_id]
            n_cached += 1
            fv_cache[js_id] = (
                byte_content, js_id
            )  # make js_id record MRU ie. least likely to be evicted
            # FALLTHRU
        else:
            # obtain and analyse the JS from MongoDB and add to list of analysed artefacts topic. On failure lodge to feature extraction failure topic
            js, js_id = get_script(db, jsr)
            if js is None:
                report_failure(producer, jsr, 'Could not locate in MongoDB')
                n_failed += 1
                continue
            if args.defensive:
                # validate that the data from mongo matches the expected hash or die trying...
                assert hashlib.sha256(js).hexdigest() == jsr.sha256
                assert hashlib.md5(js).hexdigest() == jsr.md5
                assert len(js) == jsr.size_bytes

            byte_content, failed, stderr = analyse_script(
                js, jsr, java=args.java, feature_extractor=args.extractor)
            n_analysed += 1
            if failed:
                report_failure(producer, jsr,
                               "Unable to analyse script: {}".format(stderr))
                n_failed += 1
                continue
            # put results into the fv_cache and then FALLTHRU...
            if fv_cache is not None:
                fv_cache[js_id] = (byte_content, js_id)

        assert js_id == jsr.js_id
        assert isinstance(js_id, str) and isinstance(
            jsr.js_id, str) and len(jsr.js_id) > 0
        # indexes are only (re-)ensured on the first record of the run
        save_analysis_content(db,
                              jsr,
                              byte_content,
                              ensure_indexes=is_first,
                              iff_not_exists=True)
        save_to_kafka(producer,
                      asdict(jsr),
                      to=args.to,
                      key=jsr.sha256.encode('utf-8'))
        is_first = False
        last_js_id = js_id
        last_sha256 = jsr.sha256

    print("Analysed {} artefacts, {} failed, {} cached, now={}".format(
        n_analysed, n_failed, n_cached, str(datetime.now())))
    consumer.close(autocommit=False)
    # NOTE(review): relies on a module-level `mongo` client — confirm it is
    # defined wherever this module is executed
    cleanup([mongo])
    return 0
Exemplo n.º 5
0
   args = a.parse_args()
   mongo = pymongo.MongoClient(args.db, args.port, username=args.dbuser, password=str(args.dbpassword))
   db = mongo[args.dbname]
   cursor = db.javascript_control_code.find({}, no_cursor_timeout=True) # long-running find so we try to avoid it being killed prematurely...
   with cursor:
       for rec in cursor:
           assert 'code' in rec
           control_url = rec.get('origin')
           assert control_url.startswith("http")
           if args.v:
               print(control_url)

           # recalculate vector?
           if args.recalc:
               jsr = JavascriptArtefact(url=control_url, sha256='XXX', md5='YYY', inline=False)  # only url matters for analyse script
               vectors, failed, stderr = analyse_script(rec.get('code'), jsr, java=args.java, feature_extractor=args.extractor)
               assert not failed
               assert isinstance(vectors, bytes)
               print(jsr.url)
               required_hash = hashlib.sha256(vectors).hexdigest()
               db.javascript_control_code.update_one({ '_id': rec.get('_id') }, 
                                                 { "$set": { "analysis_bytes": Binary(vectors) } })
           else: 
               vectors = rec.get('analysis_bytes')
               assert vectors is not None
               assert isinstance(vectors, bytes)
           
           d = json.loads(vectors)
           update_control_summary(db, control_url, d['statements_by_count'], d['calls_by_count'], d['literals_by_count'])
           db.javascript_controls.update_one({ 'origin': control_url }, { '$set': { 
                'sha256': hashlib.sha256(rec['code']).hexdigest(),
    assert 'origin_js_id' in hit
    js_id = hit.get('origin_js_id')
    if js_id is None:
        print("Bad data - no origin_js_id... skipping".format(hit)) 
        continue

    ret = db.scripts.find_one({ '_id': ObjectId(js_id) })
    if ret is None:  # should not happen... but if it does...
        print("Unable to locate {} is db.scripts... skipping".format(js_id))
        continue
    content = ret.get('code') 
    jsr = JavascriptArtefact(url=hit.get('origin_url'), 
                             sha256=hashlib.sha256(content).hexdigest(), 
                             md5=hashlib.md5(content).hexdigest(), 
                             inline=False)
    m, failed, stderr = analyse_script(content, jsr, java=args.java, feature_extractor=args.extractor)
    if failed:
       n_failed += 1
       continue
    m.update({ 'origin': hit.get('cited_on'), 'js_id': js_id })
    assert 'js_id' in m and len(m['js_id']) > 0  # PRE-CONDITION: ensure hits have origin_js_id field set
    best_control, next_best_control = find_best_control(m, all_controls, db=db)
    d = asdict(best_control) # NB: all fields of the model are sent to output kafka topic and Mongo

    # 2a. also send results to MongoDB for batch-oriented applications and for long-term storage
    # POST-CONDITIONS which MUST be maintained are checked before pushing to topic
    assert 'cited_on' in d and len(d['cited_on']) > 0
    assert 'origin_url' in d and len(d['origin_url']) > 0
    assert isinstance(d['origin_js_id'], str) or d['origin_js_id'] is None
    ret = db.vet_against_control.find_one_and_update({ 'origin_url': best_control.origin_url }, 
                                                     { "$set": d}, 
Exemplo n.º 7
0
def save_control(db,
                 url,
                 family,
                 variant,
                 version,
                 force=False,
                 refuse_hashes=None,
                 provider='',
                 java='/usr/bin/java',
                 feature_extractor=None,
                 content=None):
    """Fetch (or accept) a control artefact, analyse it, and upsert all control state into MongoDB.

    db: MongoDB database handle (collections: javascript_controls,
        javascript_control_code, plus update_control_summary() targets)
    url: origin URL of the control script; fetched via HTTP when *content* is None
    family/variant/version: control identification metadata stored with the record
    force: update even when jsr.sha256 is in refuse_hashes
    refuse_hashes: set of sha256 digests to skip as duplicates; None is now
        treated as the empty set (previously callers HAD to supply a set or
        the membership test below raised TypeError)
    provider: free-text provider label stored with the record
    java/feature_extractor: passed through to analyse_script()
    content: pre-fetched script bytes, skipping the HTTP GET

    Returns the JavascriptArtefact describing the control (also returned
    early, without DB writes, for too-small or duplicate artefacts).
    Raises ValueError when the fetch or the analysis fails.
    """
    assert url is not None
    assert family is not None
    assert version is not None
    # ROBUSTNESS FIX: default the refuse set so omitting it no longer crashes
    if refuse_hashes is None:
        refuse_hashes = set()
    if content is None:
        resp = requests.get(url)
        if resp.status_code != 200:
            raise ValueError("Failed to fetch [{}] {}".format(
                resp.status_code, url))
        content = resp.content

    sha256 = hashlib.sha256(content).hexdigest()
    md5 = hashlib.md5(content).hexdigest()
    jsr = JavascriptArtefact(when=str(datetime.utcnow()),
                             sha256=sha256,
                             md5=md5,
                             url=url,
                             inline=False,
                             content_type='text/javascript',
                             size_bytes=len(content))
    # tiny scripts produce vectors too sparse for a meaningful comparison
    if jsr.size_bytes < 1000:
        print(
            "Refusing artefact as too small to enable meaningful vector comparison: {}"
            .format(jsr))
        return jsr

    if not force and jsr.sha256 in refuse_hashes:
        print("Refusing to update existing control as dupe: {}".format(jsr))
        return jsr

    bytes_content, failed, stderr = analyse_script(
        content, jsr, java=java, feature_extractor=feature_extractor)
    if failed:
        raise ValueError('Could not analyse script {} - {}'.format(
            jsr.url, stderr))
    ret = json.loads(bytes_content.decode())
    cntrl_url, subfamily = identify_control_subfamily(jsr.url)
    ret.update({
        'family': family,
        'release': version,
        'variant': variant,
        'origin': url,
        'sha256': sha256,
        'md5': md5,
        'size_bytes': len(content),
        'do_not_load':
        False,  # all controls loaded by default except alpha/beta/release candidate
        'provider': provider,
        'subfamily': subfamily
    })
    #print(ret)
    assert 'sha256' in ret
    assert 'md5' in ret
    assert 'size_bytes' in ret

    # NB: only one control per url/family pair (although in theory each CDN url is enough on its own)
    resp = db.javascript_controls.find_one_and_update(
        {
            'origin': url,
            'family': family
        }, {"$set": ret}, upsert=True)
    db.javascript_control_code.find_one_and_update({'origin': url}, {
        "$set": {
            'origin': url,
            'code': Binary(content),
            'analysis_bytes': bytes_content,
            "last_updated": jsr.when
        }
    },
                                                   upsert=True)
    update_control_summary(db, url, ret['statements_by_count'],
                           ret['calls_by_count'], ret['literals_by_count'])
    return jsr