def ClusterHouses(matches, plot_groups=False):
    """Cluster house matches by geographic (lat, lon) location.

    Builds an N x 2 coordinate matrix from RFAPI.house_location, runs
    MeanShift clustering (the other algorithms constructed below were
    experiments, per the original comments), and groups houses by their
    predicted cluster label.

    Returns a dict mapping str(cluster_label) -> list of
    {"adress": ..., "location": [lat, lon]} entries.  On any failure the
    dict instead carries a single "error" key with the exception text.
    NOTE(review): "adress" is a typo but is an output key consumed by
    callers, so it is intentionally left unchanged.
    """
    groups = {}
    try:
        N = len(matches)
        X = np.zeros((N, 2))
        for m in range(N):
            loc = RFAPI.house_location(matches[m])
            X[m] = (loc[0], loc[1])
        params = {
            'quantile': .3,
            'eps': .15,
            'damping': .9,
            'preference': -5,
            'n_neighbors': 2,
            'n_clusters': 5
        }
        # a bit buggy..
        spectral = cluster.SpectralClustering(
            n_clusters=params['n_clusters'],
            eigen_solver='arpack',
            affinity="nearest_neighbors")
        # best so far!
        gmm = mixture.GaussianMixture(
            n_components=params['n_clusters'], covariance_type='full')
        # yielded one cluster..
        affinity_propagation = cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        algorithm = ms
        algorithm.fit(X)
        if hasattr(algorithm, 'labels_'):
            # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in
            # 1.24; the builtin int is the documented replacement.
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)
        for m in range(len(matches)):
            key = str(y_pred[m])
            # setdefault replaces the old `groups.get(key, None) == None`
            # check-then-create pattern (which also compared None with ==).
            groups.setdefault(key, []).append({
                "adress": RFAPI.house_address(matches[m]),
                "location": [X[m][0], X[m][1]]
            })
        logging.debug("groups = {}".format(groups))
        if plot_groups:
            HouseScore._plot_groups(X, y_pred)
    except Exception as e:
        groups["error"] = str(e)
        logging.error(groups["error"])
    return groups
def main():
    """Run a Recorded Future query from CLI options and emit CSV to stdout.

    Aggregate/count queries are printed raw.  Otherwise instances are
    flattened to the fixed output columns; entities are optionally written
    to a separate CSV file given by --entityfile.
    """
    options, args = parse_arguments()
    query = build_query(options, args)
    api = RFAPI(options.token)
    substitute_fields = ["attributes"]
    output_columns = [
        "id", "momentum", "positive", "negative", "canonical.id", "type",
        "document.id", "document.published", "document.downloaded", "start",
        "stop", "document.url", "document.title", "document.sourceId.id",
        "document.sourceId.name", "document.sourceId.media_type",
        "document.sourceId.topic", "document.sourceId.country", "fragment",
        "attributes",
    ]
    entity_columns = ["id", "name", "hits", "type", "momentum", "attributes"]
    out = csv.DictWriter(sys.stdout, output_columns, extrasaction="ignore")
    # Aggregate / count queries return one payload; print it verbatim.
    if query.get("aggregate") or query.get("output", {}).get("count"):
        res = api.query(query)
        print res
    else:
        if options.header:
            out.writerow(dict(zip(output_columns, output_columns)))
        if options.entityfile:
            entityout = csv.DictWriter(open(options.entityfile, "w"),
                                       entity_columns, extrasaction="ignore")
            entityout.writerow(dict(zip(entity_columns, entity_columns)))
        for res in api.paged_query(query):
            for i in res["instances"]:
                # Promote sentiment attributes to top-level columns.
                i["positive"] = i.get("attributes", {}).get("positive", 0.0)
                i["negative"] = i.get("attributes", {}).get("negative", 0.0)
                out.writerow(encode_instance(
                    flatten_instance(i, res["entities"], substitute_fields)))
            if options.entityfile:
                entities = pack_entity_attributes(res["entities"],
                                                  entity_columns)
                for e in entities:
                    # Here we reuse the instance formatting code to format entities for output.
                    entityout.writerow(encode_instance(
                        flatten_instance(e, res["entities"], [])))
            # Without --page, stop after the first page of results.
            if not options.page:
                break
def main():
    """Run a Recorded Future query from CLI options and emit CSV to stdout.

    Same pipeline as the sibling main() variants: raw print for
    aggregate/count queries (then early return), otherwise paged instance
    export with optional entity CSV via --entityfile.
    """
    options, args = parse_arguments()
    query = build_query(options, args)
    api = RFAPI(options.token)
    substitute_fields = ['attributes']
    output_columns = [
        'id', 'momentum', 'positive', 'negative', 'canonical.id', 'type',
        'document.id', 'document.published', 'document.downloaded', 'start',
        'stop', 'document.url', 'document.title', 'document.sourceId.id',
        'document.sourceId.name', 'document.sourceId.media_type',
        'document.sourceId.topic', 'document.sourceId.country', 'fragment',
        'attributes'
    ]
    entity_columns = ['id', 'name', 'hits', 'type', 'momentum', 'attributes']
    out = csv.DictWriter(sys.stdout, output_columns, extrasaction='ignore')
    # Aggregate / count queries return one payload; print raw and exit.
    if query.get('aggregate') or query.get('output', {}).get('count'):
        res = api.query(query)
        print res
        return
    if options.header:
        out.writerow(dict(zip(output_columns, output_columns)))
    if options.entityfile:
        entityout = csv.DictWriter(open(options.entityfile, 'w'),
                                   entity_columns,
                                   extrasaction='ignore')
        entityout.writerow(dict(zip(entity_columns, entity_columns)))
    for res in api.paged_query(query):
        for i in res['instances']:
            # Promote sentiment attributes to top-level columns.
            i['positive'] = i.get('attributes', {}).get('positive', 0.0)
            i['negative'] = i.get('attributes', {}).get('negative', 0.0)
            out.writerow(
                encode_instance(
                    flatten_instance(i, res['entities'], substitute_fields)))
        if options.entityfile:
            entities = pack_entity_attributes(res['entities'], entity_columns)
            for e in entities:
                #Here we reuse the instance formatting code to format entities for output.
                entityout.writerow(
                    encode_instance(flatten_instance(e, res['entities'], [])))
        # Without --page, stop after the first page of results.
        if not options.page:
            break
def distance(self, house, details):
    """Score a house by its average distance to ``self.fav_locations``.

    Returns a HouseScoreResult whose money term is the average distance
    times the (negative) weight.  When no coordinates are available the
    house is rejected at the cutoff value.
    """
    ret = 0.0
    median, div, cutoff, weight = self._get_measure_facts("distance")
    fav_len = len(self.fav_locations)
    if fav_len == 0 or house.get("parcel") is None or house.get(
            "parcel").get("longitude") is None:
        # No coordinates on the listing: try to geocode from the address
        # and cache the result on the house for later runs, but still
        # reject this evaluation (can't trust a just-derived location).
        house_address = RFAPI.house_address(house)
        if house_address != "":
            loc = AddressToLocation(house_address)
            if loc is not None and len(loc) == 2:
                house["parcel"] = {"latitude": loc[0], "longitude": loc[1]}
        return HouseScoreResult(cutoff * weight, cutoff, False,
                                "Can't measure distance")
    for fav in self.fav_locations:
        dist = LocationDistance(
            [house["parcel"]["latitude"], house["parcel"]["longitude"]],
            fav["loc"])
        diff = dist / div
        ret = ret + diff
    distance_average = ret / fav_len
    money = distance_average * weight
    # NOTE(review): the message uses <= cutoff but acceptance uses
    # < cutoff, so a house exactly at the cutoff is rejected with an
    # empty message — confirm whether that is intended.
    message = "" if distance_average <= cutoff else "distance {}(mil) is larger than cut of {}(mil)".format(
        distance_average, cutoff)
    return HouseScoreResult(money, distance_average,
                            (distance_average < cutoff), message)
def post_process_one(self, m, search_name, get_details=False, force=False):
    """Score a single match *m* in place and return it.

    Mutates *m*: attaches a generated URL, a 'scores' dict, and (for
    winning houses) re-scores with full listing details.  Exceptions from
    scoring are logged and the match is returned unscored.
    """
    house_details = None
    if get_details:
        house_details = self.rfapi.get_house_details(m, force=force,
                                                     cache_time_format="")
    #logging.debug("post_process(get_details={})=>{}".format(get_details, house_details))
    RFAPI.generate_url_for_house(m)
    ha = RFAPI.house_address_parts(m)
    scores = None
    try:
        scores = self.get_scores(m, house_details)
    except:
        # Deliberately broad: one bad listing must not stop the batch.
        logging.error("house : {}, throw {}".format(
            json.dumps(m), traceback.format_exc()))
    if scores is None:
        return m
    m['scores'] = scores
    is_good, message = HouseScore.get_house_score_message(m)
    if is_good:
        # NOTE(review): reconstructed nesting — the re-score below is
        # assumed to run only when details were not fetched up front.
        if house_details is None:
            house_details = self.rfapi.get_house_details(
                m, force=force, cache_time_format="")
            scores = self.get_scores(m, house_details)
            house_neighborhoods = RFAPI.house_neighborhoods(m, house_details)
            m['scores'] = scores
            is_good, message = HouseScore.get_house_score_message(m)
        logging.info(message)
        # getting city data takes a long time, will do it only for winning houses!
        """ self.city_data.get_data( house_address= ha['display'] , city=ha['city'] # Everett , state_short=ha['state'] , zip_code=ha['zip'] , house_neighborhoods=house_neighborhoods , force=force) """
    else:
        logging.debug(message)
    return m
def __init__(self, token, iocs, entity_type, mode='core'):
    '''
    Parameters
    ----------
    token : str
        Recorded Future API token
    iocs : list or dict
        List of IOCs to enrich or dict of IOCs keyed by name with the
        value as the RFID.
    entity_type : {"IpAddress", "Hash", "InternetDomainName"}
        Name of Recorded Future entity type for IOC.
    mode : {"core", "related", "debug"}
        Subset of features to return with enrichment. "core" is default.
    '''
    # Validate mode up front, before any side effects.
    if mode not in ('core', 'related', 'debug'):
        raise ValueError(
            '"mode" must be one of ("core", "related", "debug"). Input: %s.'
            % mode)
    self.rfqapi = RFAPI(token)
    self.response = collections.OrderedDict()
    # need all features early for scoring; they're removed later
    # BUG FIX: copy _FEATURES['core'] instead of aliasing it — the old
    # `keys = self._FEATURES['core']` followed by keys.update() mutated the
    # shared class-level dict, leaking 'debug'/'related' features into
    # every subsequently constructed instance.
    keys = dict(self._FEATURES['core'])
    keys.update(self._FEATURES['debug'])
    if mode in ('related', 'debug'):
        keys.update(self._FEATURES['related'])
    self.mode = mode
    self.entity_type = entity_type
    if isinstance(iocs, list):
        self.iocs = self._get_rfids(iocs)
    elif isinstance(iocs, dict):
        self.iocs = iocs
    else:
        raise ValueError('"iocs" must be list or dict.')
    # Seed one response row per IOC with the default feature values,
    # overriding the identity fields.
    for ioc in self.iocs:
        new_resp = {}
        for key in keys:
            new_resp[key] = keys[key]
            if key == 'Name':
                new_resp[key] = ioc
            elif key == 'RFID':
                new_resp[key] = self.iocs[ioc]
            elif key == 'EntityType':
                new_resp[key] = self.entity_type
        self.response[ioc] = new_resp
    self.keys = keys
def get_scores(self, house, details=None):
    """Evaluate every configured scoring measure for *house*.

    Each measure named in ``self.default_fields`` (except "value", which
    always runs last against the accumulated total) contributes a result
    dict to ``scores``; rejected measure names are comma-joined under the
    "cutoff" key, and a "facts" summary is attached before returning.
    """
    scores = {}

    def record(name, result):
        # Store the measure result; append the name to "cutoff" on reject.
        scores[name] = dict(result._asdict())
        if not result.accepted:
            scores["cutoff"] = ",".join(
                filter(None, [scores.get("cutoff"), name]))
        return result.money

    total_score = 0.0
    for name in self.default_fields:
        if name == "value":
            continue
        measure = getattr(self, name, None)
        if measure is not None:
            total_score += record(name, measure(house, details))

    # value evaluation must be run last
    record("value", self.value(house, details, total_score))

    sqft = house["sqft"] if house.get("sqft") is not None else 0.0
    neighborhoods = (RFAPI.house_neighborhoods(house, details)
                     if details is not None else [])
    scores["facts"] = {
        "build": house.get("year_built", -1),
        "full_address": RFAPI.house_address(house),
        "beds": house["beds"],
        "sqft": sqft,
        "baths": house.get("baths", 0),
        "price": house["price"],
        "County": RFAPI.house_county(house, details),
        "photo": RFAPI.house_photo_url(house, details),
        "neighborhoods": neighborhoods
    }
    return scores
def __init__(self,
             default_fields=None,
             fav_locations=None,
             cahe_folder=CACHE_DIR):
    """Build a HouseScore with optional overrides for the scoring config
    and favorite locations; defaults come from HouseScore.DEFAULTS and
    FavoriteLocations.json next to this script.

    NOTE(review): "cahe_folder" is a typo for "cache_folder", but it is a
    keyword parameter callers may rely on, so it cannot be renamed here.
    """
    super(HouseScore, self).__init__()
    self.cahe_folder = cahe_folder
    self.rfapi = RFAPI(cahe_folder=cahe_folder)
    if default_fields is not None:
        self.default_fields = default_fields
    else:
        # Shallow copy so callers mutating the instance config do not
        # change the class-level DEFAULTS top-level mapping.
        self.default_fields = HouseScore.DEFAULTS.copy()
    if fav_locations is not None:
        self.fav_locations = fav_locations
    else:
        self.fav_locations = HouseScore.LoadFavorits(
            os.path.join(SCRIPT_DIR, "FavoriteLocations.json"))
def main():
    """Run a Recorded Future query from CLI options and emit CSV to stdout.

    Third variant of this exporter: raw print + return for aggregate/count
    queries, otherwise a paged instance export with optional entity CSV.
    """
    options, args = parse_arguments()
    query = build_query(options, args)
    api = RFAPI(options.token)
    substitute_fields = ['attributes']
    output_columns = ['id', 'momentum', 'positive', 'negative',
                      'canonical.id', 'type', 'document.id',
                      'document.published', 'document.downloaded', 'start',
                      'stop', 'document.url', 'document.title',
                      'document.sourceId.id', 'document.sourceId.name',
                      'document.sourceId.media_type',
                      'document.sourceId.topic', 'document.sourceId.country',
                      'fragment', 'attributes']
    entity_columns = ['id', 'name', 'hits', 'type', 'momentum', 'attributes']
    out = csv.DictWriter(sys.stdout, output_columns, extrasaction='ignore')
    # Aggregate / count queries return one payload; print raw and exit.
    if query.get('aggregate') or query.get('output', {}).get('count'):
        res = api.query(query)
        print res
        return
    if options.header:
        out.writerow(dict(zip(output_columns, output_columns)))
    if options.entityfile:
        entityout = csv.DictWriter(open(options.entityfile, 'w'),
                                   entity_columns, extrasaction='ignore')
        entityout.writerow(dict(zip(entity_columns, entity_columns)))
    for res in api.paged_query(query):
        for i in res['instances']:
            # Promote sentiment attributes to top-level columns.
            i['positive'] = i.get('attributes', {}).get('positive', 0.0)
            i['negative'] = i.get('attributes', {}).get('negative', 0.0)
            out.writerow(encode_instance(
                flatten_instance(i, res['entities'], substitute_fields)))
        if options.entityfile:
            entities = pack_entity_attributes(res['entities'], entity_columns)
            for e in entities:
                #Here we reuse the instance formatting code to format entities for output.
                entityout.writerow(encode_instance(
                    flatten_instance(e, res['entities'], [])))
        # Without --page, stop after the first page of results.
        if not options.page:
            break
def get_all_iocs(token, e_type, index_min, index_max):
    """Return {ioc_name: rfid} for every entity of type *e_type* attached
    to Event instances whose documents were indexed in the given window.
    """
    rfqapi = RFAPI(token)
    # Aggregate query: count Event instances grouped by attached entities
    # of the requested type.
    instance_part = {
        "type": "Event",
        "attributes": [{"entity": {"type": e_type}}],
        "document": {"indexed": {"min": index_min, "max": index_max}},
    }
    count_part = {
        "axis": [{"name": "attributes.entities",
                  "type": e_type,
                  "aspect": "all"}],
        "values": ["instances"],
    }
    res = rfqapi.query({"instance": instance_part,
                        "output": {"count": count_part}})
    # Each aggregation key packs name + RFID; unpack into a lookup table.
    ioc_dict = {}
    for agg_key in res["counts"][0].keys():
        ioc_name, rfid, _unused = rf_agg_name_parser(agg_key)
        ioc_dict[ioc_name] = rfid
    return ioc_dict
def main():
    """Print the Recorded Future metadata type hierarchy.

    For each metadata type in the result of the module-level query ``q``,
    prints "name(parent)" and then each of the type's attributes.
    """
    # Construct a RFAPI query object
    rfqapi = RFAPI(TOKEN)
    # Query for the metadata
    mdata_result = rfqapi.paged_query(q)
    # Loop over all the metadata and each metadata attributes
    for metadata in mdata_result:
        mdata_types = metadata['types']
        for md_type in mdata_types:
            # Print each Root Metadata Type
            parent_type = ""
            if 'parent' in md_type:
                parent_type = str(md_type['parent'])
            print md_type['name'] + "(" + parent_type + ")"
            # Loop over attributes in this metadata type and print their corresponding types
            for md_attr_list in md_type['attrs']:
                print_attributes(md_attr_list)
def get_house_score_message(m):
    """Return (is_good, message) for an already-scored house *m*.

    is_good is False when the scores carry a "cutoff" entry; the message
    identifies the house (address, plus Redfin URL when present) and
    dumps its scores.
    """
    scores = m['scores']
    identity = RFAPI.house_address(m)
    url = m.get("URL")
    if url is not None:
        identity = "{} : http://www.redfin.com{}".format(identity, url)
    cut = scores.get("cutoff")
    if cut is None:
        return True, ("{} => {}".format(identity, scores))
    return False, "{} => Cut for {{ {} }} score = {}".format(
        identity, cut, scores)
def get_all_iocs(token, e_type, index_min, index_max):
    '''Gets all entities of type e_type found between index_min and index_max
    '''
    rfqapi = RFAPI(token)
    # Aggregate query: count Event instances indexed in the window,
    # grouped by attached entities of the requested type.
    q = {
        "instance": {
            "type": "Event",
            "attributes": [{
                "entity": {
                    "type": e_type
                }
            }],
            "document": {
                "indexed": {
                    "min": index_min,
                    "max": index_max
                }
            }
        },
        "output": {
            "count": {
                "axis": [{
                    "name": "attributes.entities",
                    "type": e_type,
                    "aspect": "all"
                }],
                "values": ["instances"]
            }
        }
    }
    res = rfqapi.query(q)
    iocs = res["counts"][0].keys()
    # Each aggregation key packs name + RFID; unpack into name -> RFID.
    ioc_dict = {}
    for ioc in iocs:
        ioc_name, rfid, unused = rf_agg_name_parser(ioc)
        ioc_dict[ioc_name] = rfid
    return ioc_dict
def __init__(self, token, iocs, entity_type, mode='core'):
    '''
    Parameters
    ----------
    token : str
        Recorded Future API token
    iocs : list or dict
        List of IOCs to enrich or dict of IOCs keyed by name with the
        value as the RFID.
    entity_type : {"IpAddress", "Hash", "InternetDomainName"}
        Name of Recorded Future entity type for IOC.
    mode : {"core", "related", "debug"}
        Subset of features to return with enrichment. "core" is default.
    '''
    # Validate mode up front, before any side effects.
    if mode not in ('core', 'related', 'debug'):
        raise ValueError('"mode" must be one of ("core", "related", "debug"). Input: %s.' % mode)
    self.rfqapi = RFAPI(token)
    self.response = collections.OrderedDict()
    # need all features early for scoring; they're removed later
    # BUG FIX: copy _FEATURES['core'] instead of aliasing it — the old
    # `keys = self._FEATURES['core']` followed by keys.update() mutated the
    # shared class-level dict, leaking 'debug'/'related' features into
    # every subsequently constructed instance.
    keys = dict(self._FEATURES['core'])
    keys.update(self._FEATURES['debug'])
    if mode in ('related', 'debug'):
        keys.update(self._FEATURES['related'])
    self.mode = mode
    self.entity_type = entity_type
    if isinstance(iocs, list):
        self.iocs = self._get_rfids(iocs)
    elif isinstance(iocs, dict):
        self.iocs = iocs
    else:
        raise ValueError('"iocs" must be list or dict.')
    # Seed one response row per IOC with the default feature values,
    # overriding the identity fields.
    for ioc in self.iocs:
        new_resp = {}
        for key in keys:
            new_resp[key] = keys[key]
            if key == 'Name':
                new_resp[key] = ioc
            elif key == 'RFID':
                new_resp[key] = self.iocs[ioc]
            elif key == 'EntityType':
                new_resp[key] = self.entity_type
        self.response[ioc] = new_resp
    self.keys = keys
risk = int(row['Risk']) if risk >= args.ip_risk_floor: ip_form = row['Name'] if '/' in ip_form: # We don't want to include CIDR ranges. continue print('\t'.join([ ip_form, 'Intel::ADDR', meta_source, intel_summ_link('ip:'+ip_form), do_notice, '-' ])) c += 1 # Hashes. api = RFAPI(args.token) hash_query = { "cluster": { "data_group": "Hash", "limit": 10000, "attributes": [ { "name": "stats.metrics.riskScore", "range": { "gte": args.hash_risk_floor } } ] }, "output": { "exclude": [
class RFEnricher(object):
    """Enrich IOCs (IPs, domains, hashes) via the Recorded Future API."""

    # Maps an IOC-matching regexp (module-level globals) to the query
    # configuration used to enrich IOCs of that kind.
    pattern_to_ioc = {
        '{0}|{1}'.format(ipv4_regexp, ipv6_regexp): {
            'data_group': 'EnrichIpAddress',
            'function': 'enriched-ip-address',
            'id_prefix': 'ip'
        },
        idn_regexp: {
            'data_group': 'EnrichInternetDomainName',
            'function': 'enriched-internet-domain-name',
            'id_prefix': 'idn'
        },
        hash_regexp: {
            'data_group': 'EnrichHash',
            'function': 'enriched-hash',
            'id_prefix': 'hash'
        }
    }

    def __init__(self, token):
        # API client shared by all queries made by this enricher.
        self.rfapi = RFAPI(token)

    def enrich(self, iocs):
        """Return {ioc: enrichment} for every recognizable IOC in *iocs*.

        Progress and unmatched IOCs are reported on stderr.
        """
        sys.stderr.write('Enriching {0} IOC(s)...\n'.format(len(iocs)))
        enrichment = {}
        for ioc in iocs:
            for pattern, query_config in self.pattern_to_ioc.items():
                if not re.match(pattern, ioc):
                    continue
                sys.stderr.write('\tProcessing {0} : {1}... '.format(query_config['id_prefix'], ioc))
                enrichment[ioc] = self.query_enrich_ioc(ioc, query_config)
                sys.stderr.write('Done.\n')
                break
            else:
                # for/else: no pattern matched this IOC.
                sys.stderr.write('Unable to match "{0}" with any supported IOC type.\n'.format(ioc))
        return enrichment

    def get_entity_id(self, id_prefix, name):
        """Resolve an IOC to an RF entity id.

        IPs and domains use a deterministic "prefix:name" id; hashes
        require an entity lookup and may return None when unknown.
        """
        if id_prefix != 'hash':
            return "{0}:{1}".format(id_prefix, name)
        res = self.rfapi.query({
            'entity': {
                'name': name,
                'type': 'Hash',
                'limit': 1
            }
        })
        if len(res.get('entities', [])) == 0:
            return None
        return res['entities'][0]

    def query_enrich_ioc(self, text, query_config):
        """Query enrichment for one IOC; returns the stats dict (with an
        'rf_link' added) or the string "No enrichment available."."""
        entity_id = self.get_entity_id(query_config['id_prefix'], text)
        if not entity_id:
            return "No enrichment available."
        q = {
            "cluster": {
                "function": query_config['function'],
                "attributes": [
                    {
                        "entity": {
                            "id": entity_id
                        }
                    }
                ],
                "limit": 1,
                "data_group": query_config['data_group']
            },
            "output": {
                "inline_entities": True
            }
        }
        res = self.rfapi.query(q)
        if res['count']['events']['total'] == 0:
            return "No enrichment available."
        enr_data = res['events'][0]['stats']
        enr_data['rf_link'] = 'https://www.recordedfuture.com/live/sc/entity/' + entity_id
        return enr_data
for row in csv_fd: risk = int(row['Risk']) if risk >= args.ip_risk_floor: ip_form = row['Name'] if '/' in ip_form: # We don't want to include CIDR ranges. continue print('\t'.join([ ip_form, 'Intel::ADDR', meta_source, intel_summ_link('ip:' + ip_form), do_notice, '-' ])) c += 1 # Hashes. api = RFAPI(args.token) hash_query = { "cluster": { "data_group": "Hash", "limit": 10000, "attributes": [{ "name": "stats.metrics.riskScore", "range": { "gte": args.hash_risk_floor } }] }, "output": { "exclude": ["stats.entity_lists"],
def main():
    """Parse CLI options, build and run one RF query, print the raw result."""
    options, args = parse_arguments()
    query = build_query(options, args)
    api = RFAPI(options.token)
    res = api.query(query)
    print res
def __init__(self, token):
    # Recorded Future API client used for all subsequent queries.
    self.rfapi = RFAPI(token)
for f in toplists: w = csv.writer(open(f + '.csv', 'wb'), dialect='toplists') header = ['Name', 'Hits'] if bools['assoc']: header += ['Associated Entities'] header += ['Link'] w.writerow(header) w.writerows(toplists[f]) def get_arguments(): parser = argparse.ArgumentParser(description='Pull top lists from Recorded Future.') parser.add_argument('token', help="Recorded Future API token.") parser.add_argument('n', help="Number of results to return per query.", type=int) parser.add_argument('period', help="Number of days back to query.", type=int) parser.add_argument('query_file', nargs='+', help="Query files.") parser.add_argument('-assoc', '--assoc', help="Include associated products, techs, malware.", action='store_true') parser.add_argument('-new', '--new', help="Include only entities first seen in the last %s days." % emerging_threshold.days, action='store_true') return parser.parse_args() if __name__ == '__main__': args = get_arguments() token = args.token n = args.n period = args.period files = args.query_file bools = {"assoc": args.assoc, "new": args.new} rfqapi = RFAPI(token) queries = get_queries(files) toplists = get_toplists(queries, rfqapi, n, period, bools) write_details(toplists, bools)
"not": { "ip": "192.168.0.0/16" } }, { "not": { "ip": "127.0.0.1" } }, { "not": { "ip": "0.0.0.0" } }], "limit": 10 }, "output": { "exclude": ["stats"], "inline_entities": True } } # Using RFAPI module, run query # Note: To pull back all results, use rfqapi.paged_query(q) # and a higher limit. rfqapi = RFAPI(token) result = rfqapi.query(q) # Display the results (in this case, limit is 1) for res in result['events']: print "Event: \n" print str(res) + '\n'
class RFEnricher(object):
    """Enrich IOCs (IPs, domains, hashes) via the Recorded Future API.

    Variant of the sibling RFEnricher: clusters are queried by data_group
    only (no 'function' key in the configuration).
    """

    # Maps an IOC-matching regexp (module-level globals) to the query
    # configuration used to enrich IOCs of that kind.
    pattern_to_ioc = {
        '{0}|{1}'.format(ipv4_regexp, ipv6_regexp): {
            'data_group': 'IpAddress',
            'id_prefix': 'ip'
        },
        idn_regexp: {
            'data_group': 'InternetDomainName',
            'id_prefix': 'idn'
        },
        hash_regexp: {
            'data_group': 'Hash',
            'id_prefix': 'hash'
        }
    }

    def __init__(self, token):
        # API client shared by all queries made by this enricher.
        self.rfapi = RFAPI(token)

    def enrich(self, iocs):
        """Return {ioc: enrichment} for every recognizable IOC in *iocs*.

        Progress and unmatched IOCs are reported on stderr.
        """
        sys.stderr.write('Enriching {0} IOC(s)...\n'.format(len(iocs)))
        enrichment = {}
        for ioc in iocs:
            for pattern, query_config in self.pattern_to_ioc.items():
                if not re.match(pattern, ioc):
                    continue
                sys.stderr.write('\tProcessing {0} : {1}... '.format(
                    query_config['id_prefix'], ioc))
                enrichment[ioc] = self.query_enrich_ioc(ioc, query_config)
                sys.stderr.write('Done.\n')
                break
            else:
                # for/else: no pattern matched this IOC.
                sys.stderr.write(
                    'Unable to match "{0}" with any supported IOC type.\n'.
                    format(ioc))
        return enrichment

    def get_entity_id(self, id_prefix, name):
        """Resolve an IOC to an RF entity id.

        IPs and domains use a deterministic "prefix:name" id; hashes
        require an entity lookup and may return None when unknown.
        """
        if id_prefix != 'hash':
            return "{0}:{1}".format(id_prefix, name)
        res = self.rfapi.query(
            {'entity': {
                'name': name,
                'type': 'Hash',
                'limit': 1
            }})
        if len(res.get('entities', [])) == 0:
            return None
        return res['entities'][0]

    def query_enrich_ioc(self, text, query_config):
        """Query enrichment for one IOC; returns the stats dict (with an
        'rf_link' added) or the string "No enrichment available."."""
        entity_id = self.get_entity_id(query_config['id_prefix'], text)
        if not entity_id:
            return "No enrichment available."
        q = {
            "cluster": {
                "attributes": [{
                    "entity": {
                        "id": entity_id
                    }
                }],
                "limit": 1,
                "data_group": query_config['data_group']
            },
            "output": {
                "inline_entities": True
            }
        }
        res = self.rfapi.query(q)
        if res['count']['events']['total'] == 0:
            return "No enrichment available."
        enr_data = res['events'][0]['stats']
        enr_data[
            'rf_link'] = 'https://www.recordedfuture.com/live/sc/entity/' + entity_id
        return enr_data
class HouseScore(object): DEFAULTS = { "distance": { "median": 8.0, # stay in King County "div": 0.5, "weight": -9000.0, # one mile costs ( $900 / year ) * 10 years "cutoff": 18.0 }, "area": { "median": 2700.0, # We are looking for a house around this area "div": 1.0, "weight": 225.0, # a good house would cost $275 per SF "cutoff": 1900.0 }, "build": { "median": 2018.0, "div": 1.0, "weight": -2000.0, "cutoff": 1990.0 }, "beds": { "median": 4.0, "div": 1.0, "weight": 10000.0, # for each extra room you get 10K "cutoff": 3.0 }, "backyard": { "median": 2500.0, # not used "div": 1.0, "weight": 5.0, # I'd pay extra 25K for 5000 SF back yard "cutoff": 500.0 }, "crime": { "median": 236.5, "div": 1.0, "weight": 1.0, "cutoff": -10.0 }, "history": { "median": 7.0, "div": 1000.0, # lose 0.1% every day on market "weight": 1.0, # penelty = dom * price / div "cutoff": 155.0, # if no human finds this house good for 5 months, don't consider it! "history_days": 365.0, "pending_penelty": 10000.0, "inspection_penelty": 25000.0, "delisted_penelty": 5000.0 }, "layout": { "median": 0.0, # not used "div": 0.01, "weight": 1.0, "cutoff": 0.0, # not used "beds_min": 3.0, "bed_bonus": 10000.0, "baths_min": 2.5, "bath_bonus": 2000.0, "required": { "Attached Garage": 5000.0, "Living Room": 5000.0, "Dining Room": 5000.0 }, "optional": { "Bonus Room": 1000.0, "Family Room": 5000.0, "Recreation Room": 1000.0, "Walk-In Closet": 1000.0, "Utility Room": 1000.0, "Loft": 1000.0, "Den": 3000.0, "Office": 1000.0 } }, "amenities": { "median": 95.0, # not used "div": 0.01, "weight": 1.0, "cutoff": 80.0, # should have 80% of what we are looking for "required": { "Forced Air Heating": 10000.0, "Dishwasher": 2000.0, "Dryer": 1000.0, "Oven": 1000.0, "Refrigerator": 3000.0, "'Washer'": 1000.0, "Public Water Source": 5000.0, "Sewer Connected": 5000.0, "Garbage Disposal": 1000.0, "High Speed Internet": 5000.0 }, "optional": { "Microwave": 500.0, "Composition Roof": 2000.0, "Central Air Conditioning": 10000.0, 
"'King County'": 100000.0, "'Bothell'": 50000.0, "'Kenmore'": 50000.0, "'Brier'": 20000.0 } }, "value": { # always run this last "total_cutoff": 500000.0, "percentage_cutoff": -50.0 } } def __init__(self, default_fields=None, fav_locations=None, cahe_folder=CACHE_DIR): super(HouseScore, self).__init__() self.cahe_folder = cahe_folder self.rfapi = RFAPI(cahe_folder=cahe_folder) if default_fields is not None: self.default_fields = default_fields else: self.default_fields = HouseScore.DEFAULTS.copy() if fav_locations is not None: self.fav_locations = fav_locations else: self.fav_locations = HouseScore.LoadFavorits( os.path.join(SCRIPT_DIR, "FavoriteLocations.json")) def value(self, house, details=None, total_score=0.0): total_cutoff = self.default_fields["value"]["total_cutoff"] percentage_cutoff = self.default_fields["value"]["percentage_cutoff"] details_available = details is not None message = [] accepted = True if total_score < total_cutoff and details_available: accepted = False message.append("house valued at {} which is less than {}".format( total_score, total_cutoff)) gain = total_score - float(house["price"]) gain_percentage = gain / total_score * 100 if gain_percentage < percentage_cutoff and details_available: accepted = False message.append("house ROI of {} which is less than {}".format( gain_percentage, percentage_cutoff)) return HouseScoreResult(total_score, gain_percentage, accepted, ", ".join(message)) def get_scores(self, house, details=None): scores = {} total_score = 0.0 for k in self.default_fields: if k == "value": continue method = getattr(self, k, None) if method is None: continue result = method(house, details) scores[k] = dict(**result._asdict()) total_score += result.money if not result.accepted: scores["cutoff"] = ",".join( filter(None, [scores.get("cutoff"), k])) # value evaluation must be run last result = self.value(house, details, total_score) scores["value"] = dict(**result._asdict()) if not result.accepted: scores["cutoff"] = ",".join( 
filter(None, [scores.get("cutoff"), "value"])) scores["facts"] = { "build": house.get("year_built", -1), "full_address": RFAPI.house_address(house), "beds": house["beds"], "sqft": house["sqft"] if house.get("sqft") is not None else 0.0, "baths": house.get("baths", 0), "price": house["price"], "County": RFAPI.house_county(house, details), "photo": RFAPI.house_photo_url(house, details), "neighborhoods": RFAPI.house_neighborhoods(house, details) if details is not None else [] } return scores def _get_measure_facts(self, measure_name): median = self.default_fields[measure_name]["median"] div = self.default_fields[measure_name]["div"] cutoff = self.default_fields[measure_name]["cutoff"] weight = self.default_fields[measure_name]["weight"] return (median, div, cutoff, weight) def area(self, house, details): median, div, cutoff, weight = self._get_measure_facts("area") if house.get("sqft") is None: return HouseScoreResult(0.0, 0.0, False, "missing 'sqft'") area = house["sqft"] money = area * weight message = "" if area >= cutoff else "area {}(sf) is less than cut of {}(sf)".format( area, cutoff) return HouseScoreResult(money, area, (area >= cutoff), message) def build(self, house, details): median, div, cutoff, weight = self._get_measure_facts("build") if house.get("year_built") is None: return HouseScoreResult(0.0, 0.0, False, "missing 'year_built'") year_built = house["year_built"] money = (median - year_built) / div * weight message = "" if year_built >= cutoff else "House is built in {}, older than cut of {}".format( year_built, cutoff) return HouseScoreResult(money, year_built, (year_built >= cutoff), message) def backyard(self, house, details): if house.get("lotsize") is None or house.get("sqft") is None: return HouseScoreResult(0.0, 0.0, False, "missing 'lotsize' or 'sqft'") median, div, cutoff, weight = self._get_measure_facts("backyard") remaining_for_backyard = float(house["lotsize"]) - float(house["sqft"]) money = remaining_for_backyard * weight message = "" if 
remaining_for_backyard >= cutoff else "backyard {}(sf) is less than cut of {}(sf)".format( remaining_for_backyard, cutoff) return HouseScoreResult(money, remaining_for_backyard, (remaining_for_backyard >= cutoff), message) def amenities(self, house, details): return self.amenitiesInfo("amenities", house, details) def layout(self, house, details): info = self.amenitiesInfo("layout", house, details) money_score = info.money # check beds / baths if house.get("beds") is not None: extra_beds = house["beds"] - self.default_fields["layout"][ "beds_min"] money_score += extra_beds * self.default_fields["layout"][ "bed_bonus"] if house.get("baths") is not None: extra_baths = house["baths"] - self.default_fields["layout"][ "baths_min"] money_score += extra_baths * self.default_fields["layout"][ "bath_bonus"] return HouseScoreResult(money_score, info.value, info.accepted, info.message) def dom(self, house, details): median, div, cutoff, weight = self._get_measure_facts("history") now_epoch = time.time() * 1000.0 oldest_epoch = now_epoch - self.default_fields["history"][ "history_days"] * MS_IN_A_DAY house["dom_fixed"] = house["dom"] if house.get( "dom") is not None else 0.0 if details is not None: oldest_history_after_sale = now_epoch for event in details["payload"]["propertyHistoryInfo"]["events"]: if "sold" in event["eventDescription"].lower() or event[ "eventDate"] < oldest_epoch: # or event["historyEventType"] != 1 break # assuming events are sorted! 
            # --- tail of a day-on-market method whose `def` begins above this
            # view: scans listing-history events for the oldest event after the
            # last sale, then derives a corrected days-on-market figure. ---
            event_epoc = event["eventDate"]
            if event_epoc < oldest_history_after_sale:
                #logging.debug("#1 oldest = {}, this event = {}".format(oldest_history_after_sale,event_epoc))
                oldest_history_after_sale = event_epoc
        # Event timestamps are epoch milliseconds; convert span to days.
        oldest_history_after_sale_days = (
            now_epoch - oldest_history_after_sale) / MS_IN_A_DAY
        #logging.debug("dom={}, dom_fixed={}".format(house["dom"], oldest_history_after_sale_days))
        # "dom_fixed" = the larger of the feed-reported DOM and the DOM implied
        # by the history events (the feed sometimes under-reports).
        house["dom_fixed"] = max(house["dom"], oldest_history_after_sale_days)
        if house.get("dom") is None:
            return HouseScoreResult(0.0, 0.0, False, "missing 'dom'")
        dom = house["dom_fixed"]
        # Longer time on market than the median reduces the (monetary) score.
        money = (median - dom) / div * float(house["price"])
        message = "House was on the market for {} days".format(int(dom))
        if int(dom) != int(house["dom"]):
            message = message + " (reported {} days)".format(int(house["dom"]))
        if dom > cutoff:
            message = message + ", more than cut of {}".format(int(cutoff))
        return HouseScoreResult(money, dom, (dom < cutoff), message)

    @staticmethod
    def AddIfNotExists(list_to_append, element):
        """Append element to the list only if not already present; returns the list."""
        if element not in list_to_append:
            list_to_append.append(element)
        return list_to_append

    def history(self, house, details):
        """Score the listing history: start from the DOM score, then subtract
        configured penalties for inspection/pending/delisted-relisted events
        that fall inside the configured history window.
        """
        na = HouseScoreResult(0, 0.0, True, "Not enough details")
        if details is None:
            return na
        if details.get("payload") is None or details.get("payload").get(
                "propertyHistoryInfo") is None:
            return na
        median, div, cutoff, weight = self._get_measure_facts("history")
        messages = []
        dom_results = self.dom(house, details)
        messages.append(dom_results.message)
        # Work in "days since epoch" so event timestamps compare directly.
        epoch_days = time.time() * 1000.0 / MS_IN_A_DAY
        oldest_epoch_days = epoch_days - self.default_fields["history"][
            "history_days"]
        oldest_epoch_days = max(house["dom_fixed"], oldest_epoch_days)
        total_penelty = 0.0
        for event in details["payload"]["propertyHistoryInfo"]["events"]:
            event_epoc = event["eventDate"]
            event_epoc_days = event_epoc / MS_IN_A_DAY
            event_epoc_diff = int(epoch_days - event_epoc_days)
            # Serialize the whole event so substring checks see every field.
            event_str = json.dumps(event).lower()
            #logging.debug("#2 oldest = {}, this event = {}, diff = {}".format(oldest_epoch_days,event_epoc_days,event_epoc_diff))
            if event_epoc_days < oldest_epoch_days:
                continue
            #logging.debug("event = {}".format(event_str))
            if "inspection" in event_str:
                total_penelty -= self.default_fields["history"][
                    "inspection_penelty"]
                HouseScore.AddIfNotExists(
                    messages, "was pending inspection {} days ago".format(
                        event_epoc_diff))
            elif "pending" in event_str:
                total_penelty -= self.default_fields["history"][
                    "pending_penelty"]
                HouseScore.AddIfNotExists(
                    messages,
                    "was pending {} days ago".format(event_epoc_diff))
            elif event["eventDescription"].lower() in ["delisted", "relisted"]:
                total_penelty -= self.default_fields["history"][
                    "delisted_penelty"]
                HouseScore.AddIfNotExists(
                    messages,
                    "was relisted {} days ago".format(event_epoc_diff))
        return HouseScoreResult(total_penelty + dom_results.money,
                                dom_results.value, dom_results.accepted,
                                ", ".join(messages))

    def amenitiesInfo(self, key, house, details):
        """Score amenities for measure `key` by substring-matching required and
        optional amenity names against the stringified details payload.
        """
        na = HouseScoreResult(0, 0.0, True, "Not enough details")
        if details is None:
            return na
        if details.get("payload") is None or details.get("payload").get(
                "amenitiesInfo") is None:
            return na
        median, div, cutoff, weight = self._get_measure_facts(key)
        #logging.debug("amenities : detials = {}".format(details))
        # Crude but effective: search the whole payload repr, lower-cased.
        amenities_str = str(details["payload"]).lower()
        #logging.debug("amenities_str = {}".format(amenities_str)) #disable me
        required_amenities_sum = 0.0
        amenities_score = 0.0
        missing_amenities = []
        for k in self.default_fields[key]["required"]:
            v = self.default_fields[key]["required"][k]
            required_amenities_sum = required_amenities_sum + v
            if k.lower() in amenities_str:
                amenities_score = amenities_score + v
            else:
                missing_amenities.append(k)
        # Optional amenities add to the score but not to the denominator.
        for k in self.default_fields[key]["optional"]:
            if k.lower() in amenities_str:
                amenities_score = amenities_score + self.default_fields[key][
                    "optional"][k]
            else:
                missing_amenities.append(k)
        missing_amenities_message = "" if len(
            missing_amenities) == 0 else "missing : [ {} ]".format(
                ",".join(missing_amenities))
        percentage = amenities_score / required_amenities_sum / div
        message = missing_amenities_message if percentage >= cutoff else " percentage {} is less than cut of {}, {}".format(
            percentage, cutoff, missing_amenities_message)
        return HouseScoreResult(amenities_score, percentage,
                                (percentage >= cutoff), message)

    def distance(self, house, details):
        """Average distance from the house to each favorite location, scaled by
        the measure divisor; fails the cutoff when the parcel has no coordinates.
        """
        ret = 0.0
        median, div, cutoff, weight = self._get_measure_facts("distance")
        fav_len = len(self.fav_locations)
        if fav_len == 0 or house.get("parcel") is None or house.get(
                "parcel").get("longitude") is None:
            # Try to backfill coordinates via geocoding for future runs,
            # but still report "can't measure" for this evaluation.
            # NOTE(review): nesting of the early return reconstructed from
            # mangled formatting — confirm it sits at this level.
            house_address = RFAPI.house_address(house)
            if house_address != "":
                loc = AddressToLocation(house_address)
                if loc is not None and len(loc) == 2:
                    house["parcel"] = {"latitude": loc[0], "longitude": loc[1]}
            return HouseScoreResult(cutoff * weight, cutoff, False,
                                    "Can't measure distance")
        for fav in self.fav_locations:
            dist = LocationDistance(
                [house["parcel"]["latitude"], house["parcel"]["longitude"]],
                fav["loc"])
            diff = dist / div
            ret = ret + diff
        distance_average = ret / fav_len
        money = distance_average * weight
        message = "" if distance_average <= cutoff else "distance {}(mil) is larger than cut of {}(mil)".format(
            distance_average, cutoff)
        return HouseScoreResult(money, distance_average,
                                (distance_average < cutoff), message)

    @staticmethod
    def get_house_score_message(m):
        """Return (is_good, message) for a scored house; is_good is False when
        any measure recorded a 'cutoff' entry in m['scores'].
        """
        scores = m['scores']
        id_str = RFAPI.house_address(m)
        if m.get("URL") is not None:
            id_str = "{} : http://www.redfin.com{}".format(id_str, m["URL"])
        if scores.get("cutoff") is not None:
            return False, "{} => Cut for {{ {} }} score = {}".format(
                id_str, scores["cutoff"], scores)
        else:
            return True, ("{} => {}".format(id_str, scores))

    def post_process_one(self, m, search_name, get_details=False, force=False):
        """Score a single match in place (sets m['scores']); fetches house
        details lazily — only up front when get_details is set, and again for
        houses that score as good.
        """
        house_details = None
        if get_details:
            house_details = self.rfapi.get_house_details(m,
                                                         force=force,
                                                         cache_time_format="")
        #logging.debug("post_process(get_details={})=>{}".format(get_details, house_details))
        RFAPI.generate_url_for_house(m)
        ha = RFAPI.house_address_parts(m)
        scores = None
        try:
            scores = self.get_scores(m, house_details)
        except:
            # Scoring failures are logged and the match is returned unscored.
            logging.error("house : {}, throw {}".format(
                json.dumps(m), traceback.format_exc()))
        if scores is None:
            return m
        m['scores'] = scores
        is_good, message = HouseScore.get_house_score_message(m)
        if is_good:
            if house_details is None:
                # Good houses get a full-detail re-score.
                # NOTE(review): re-score nesting reconstructed from mangled
                # formatting — confirm it belongs inside this `if`.
                house_details = self.rfapi.get_house_details(
                    m, force=force, cache_time_format="")
                scores = self.get_scores(m, house_details)
            house_neighborhoods = RFAPI.house_neighborhoods(m, house_details)
            m['scores'] = scores
            is_good, message = HouseScore.get_house_score_message(m)
            logging.info(message)
            # getting city data takes a long time, will do it only for winning houses!
            """ self.city_data.get_data(
                house_address= ha['display']
                , city=ha['city']  # Everett
                , state_short=ha['state']
                , zip_code=ha['zip']
                , house_neighborhoods=house_neighborhoods
                , force=force) """
        else:
            logging.debug(message)
        return m

    def post_process(self, matches, search_name, get_details=False,
                     force=False):
        """Score all matches in parallel (mutating each in place), then count
        and log how many passed. Returns the same list.
        """
        good_ones = 0
        # NOTE: executor.map's result iterator is never consumed, so per-item
        # exceptions are silently dropped; the `with` block still waits for
        # all submitted work to finish.
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
            executor.map(
                lambda m: self.post_process_one(m,
                                                search_name=search_name,
                                                get_details=get_details,
                                                force=force), matches)
        logging.debug("Done parallel processing")
        for m in tqdm(matches):
            # m = self.post_process_one(m, search_name=search_name, get_details=get_details, force=force)
            is_good, message = HouseScore.get_house_score_message(m)
            if is_good:
                good_ones = good_ones + 1
        logging.debug("{} Matches={}/{}".format(search_name, good_ones,
                                                len(matches)))
        return matches

    @staticmethod
    def _plot_groups(X, y_pred):
        """Scatter-plot the clustered points, one color per cluster label."""
        colors = np.array(
            list(
                islice(
                    cycle([
                        '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
                        '#984ea3', '#999999', '#e41a1c', '#dede00'
                    ]), int(max(y_pred) + 1))))
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
        plt.show()

    @staticmethod
    def ClusterHouses(matches, plot_groups=False):
        """Cluster houses by lat/long with MeanShift; returns a dict mapping
        cluster label (as str) to a list of {adress, location} entries, or
        {"error": ...} on failure.
        """
        groups = {}
        try:
            N = len(matches)
            X = np.zeros((N, 2))
            for m in range(N):
                loc = RFAPI.house_location(matches[m])
                #logging.debug("ClusterHouses({})".format(loc))
                X[m] = (loc[0], loc[1])
            params = {
                'quantile': .3,
                'eps': .15,
                'damping': .9,
                'preference': -5,
                'n_neighbors': 2,
                'n_clusters': 5
            }
            # Alternative algorithms kept for experimentation; only MeanShift
            # is actually used below.
            # a bit buggy..
            spectral = cluster.SpectralClustering(
                n_clusters=params['n_clusters'],
                eigen_solver='arpack',
                affinity="nearest_neighbors")
            # best so far!
            gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                          covariance_type='full')
            # yielded one cluster..
            affinity_propagation = cluster.AffinityPropagation(
                damping=params['damping'], preference=params['preference'])
            bandwidth = cluster.estimate_bandwidth(X,
                                                   quantile=params['quantile'])
            ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
            algorithm = ms
            algorithm.fit(X)
            if hasattr(algorithm, 'labels_'):
                # NOTE(review): np.int is removed in NumPy >= 1.24; this code
                # targets an older NumPy.
                y_pred = algorithm.labels_.astype(np.int)
            else:
                y_pred = algorithm.predict(X)
            for m in range(len(matches)):
                key = str(y_pred[m])
                if groups.get(key, None) == None:
                    groups[key] = []
                groups[key].append({
                    "adress": RFAPI.house_address(matches[m]),
                    "location": [X[m][0], X[m][1]]
                })
            logging.debug("groups = {}".format(groups))
            if plot_groups:
                HouseScore._plot_groups(X, y_pred)
        except Exception as e:
            groups["error"] = str(e)
            logging.error(groups["error"])
        return groups

    @staticmethod
    def filter_good_houses(houses):
        """Keep only houses whose scores carry no 'cutoff' (i.e. passed all measures)."""
        return [m for m in houses if m['scores'].get("cutoff") is None]

    @staticmethod
    def add_html_tab(tab_id, tab_name, tab_content, tabs):
        """Add one tab (button + content div) to the tabs accumulator; when
        tabs is None a fresh accumulator is created and the new tab becomes
        the default-open one. Returns the accumulator.
        """
        is_first_tab = False
        tab_template = '<button class="tablinks {2}" onclick="openTab(event, \'{0}\')">{1}</button>'
        tab_content_template = '<div id="{0}" class="tabcontent">{1}</div>'
        if tabs is None:
            tabs = {"tabs": ['<div class="tab">', '</div>'], "contents": []}
            is_first_tab = True
        # Insert before the closing </div> so buttons stay inside the bar.
        tabs['tabs'].insert(
            len(tabs['tabs']) - 1,
            tab_template.format(tab_id, tab_name,
                                'defaultOpen' if is_first_tab else ""))
        tabs['contents'].append(
            tab_content_template.format(tab_id, tab_content))
        return tabs

    @staticmethod
    def get_house_summary(house, rank=0):
        """Flatten a scored house into a display-ready summary dict; returns
        {} (with a warning) when the house has no scores.
        """
        score = house.get("scores")
        if score is None:
            logging.warning("score is missing for {}".format(house))
            return {}
        # Back-compute the first-listed date from the history DOM value.
        first_listed = datetime.datetime.today() - datetime.timedelta(
            days=score["history"]["value"])
        house_summary = {
            "Address": score["facts"]["full_address"],
            "score": score["value"]["value"],
            "distance": score["distance"]["value"],
            "County": score["facts"]["County"],
            "Year Build": score["facts"]["build"],
            "beds": score["facts"]["beds"],
            "baths": score["facts"]["baths"],
            "price": score["facts"]["price"],
            "sqft": score["facts"]["sqft"],
            "lot size": score["facts"]["sqft"] + score["backyard"]["value"],
            "first_listed": first_listed.strftime("%m/%d/%Y"),
            "dom": score["history"]["message"],
            "url": "http://www.redfin.com{}".format(house["URL"]),
            "picture": score["facts"]["photo"],
            "price_for_sf": score["facts"]["price"] / score["facts"]["sqft"]
            if score["facts"]["sqft"] > 0 else 0.0,
            "user_rank": rank
        }
        return house_summary

    @staticmethod
    def get_houses_category_html(houses, summary="", category_id=""):
        """Render one report category: a table row per house with
        Summary/Details/Links tabs. Returns "" when no house renders.
        """
        html_content = []
        #logging.debug("houses {}".format(len(houses)))
        even_raw = False
        house_id = 0
        for house in houses:
            house_id += 1
            even_raw = not even_raw
            score = house.get("scores")
            if score is None:
                logging.warning("score is missing for {}".format(house))
                continue
            house_summary_raw = HouseScore.get_house_summary(house)
            house_summary = {
                "Address": house_summary_raw["Address"],
                "score": "{:.2f}".format(house_summary_raw["score"]),
                "distance": "{:.2f}".format(house_summary_raw["distance"]),
                "County": house_summary_raw["County"],
                "Year Build": house_summary_raw["Year Build"],
                "beds": house_summary_raw["beds"],
                "baths": house_summary_raw["baths"],
                "price": house_summary_raw["price"],
                "sqft": house_summary_raw["sqft"],
                "lot size": house_summary_raw["lot size"],
                "dom": house_summary_raw["dom"],
                "$/sf": "{:.0f}".format(house_summary_raw["price_for_sf"])
            }
            is_good, message = HouseScore.get_house_score_message(house)
            if is_good:
                pass_message = "Good ( score = {:.2f}, distance = {:.2f} )".format(
                    score["value"]["value"], score["distance"]["value"])
            else:
                house_summary["Failed"] = "[ {} ]".format(score["cutoff"])
                pass_message = "Failed [ {} ] ( score = {:.2f}, distance = {:.2f} )".format(
                    score["cutoff"], score["value"]["value"],
                    score["distance"]["value"])
            house_summary_html = DicToTHML(house_summary)
            score_html = DicToTHML(score)
            # build taps
            tabs = None
            house_group_id = "{}_{}".format(category_id, house_id)
            tabs = HouseScore.add_html_tab(
                tab_id="summary_{}".format(house_group_id),
                tab_name="Summary",
                tab_content=house_summary_html,
                tabs=tabs)
            tabs = HouseScore.add_html_tab(
                tab_id="details_{}".format(house_group_id),
                tab_name="Details",
                tab_content=score_html,
                tabs=tabs)
            maps_url = "https://www.google.com/maps/place/{}".format(
                score["facts"]["full_address"].replace(' ', '+'))
            areavibes_url = "https://www.areavibes.com/{}-{}/livability/".format(
                house['address_data']['city'].replace('-', '+').replace(
                    ' ', '+'), house['address_data']['state'])
            spotcrime_url = "https://spotcrime.com/#{}".format(
                score["facts"]["full_address"].replace(' ', '%20').replace(
                    '-', '%20').replace(',', '%2C'))
            tabs = HouseScore.add_html_tab(
                tab_id="links_{}".format(house_group_id),
                tab_name="Links",
                tab_content="""
                <H3><A href="{}">Map</A></H3>
                <H3><A href="{}">Areavibes</A></H3>
                <H3><A href="{}">SpotCrime</A></H3>
                """.format(maps_url, areavibes_url, spotcrime_url),
                tabs=tabs)
            # #<IFRAME width='100%' height='500' src='https://spotcrime.com/"+spotcrime_sub_path+"'/>"
            tabs_html = "\n".join(tabs['tabs'] + tabs['contents'])
            this_house_report = r"""
            <TR {4}> <!-- draggable="true" //-->
            <TD align="center" valign="top" >
                <TABLE width="100%">
                <TR><TD colspan="2" width="100%">
                    <H2><A href="http://www.redfin.com{1}">{2}</A></H2>
                </TD></TR>
                <TR><TD width="50%" align="center" valign="top" >
                    <A href="http://www.redfin.com{1}"><IMG width="100%" src="{0}" /></A><BR/>
                    <P>Details : {5}</P>
                </TD><TD align="left" valign="top" width="50%">
                    {3}
                </TD></TR>
                </TABLE>
            </TD>
            </TR>
            """.format(
                score["facts"]["photo"]  #0
                , house["URL"]  #1
                , score["facts"]["full_address"]  #2
                , tabs_html  #3
                , ('class="one-house-dragable page-break"' if even_raw else
                   'class="one-house-dragable no-page-break"')  #4
                , pass_message)
            html_content.append(this_house_report)
        if len(html_content) == 0:
            return ""
        category_template = """
        <H1>{$SUMMARY}</H1>
        <BR/>
        <TABLE width="90%" align="center" valign="top">
        {$ACCORDION_TEMPLATE_BODY}
        </TABLE>
        """
        category_template = category_template.replace("{$SUMMARY}", summary)
        category_template = category_template.replace(
            "{$ACCORDION_TEMPLATE_BODY}", "\n".join(html_content))
        return category_template

    @staticmethod
    def search_for_range(matches,
                         active_only=True,
                         min_price=0,
                         max_price=7600000,
                         only_good=True):
        """Filter matches by status, goodness and price range, then sort by
        total score vs price. NOTE(review): default max_price=7600000 looks
        like a typo for 760000 — confirm against callers.
        """
        good_houses = matches
        if active_only:
            good_houses = [
                m for m in good_houses
                if m.get('status') is None or m['status'] == 'Active'
            ]
        if only_good:
            good_houses = [
                m for m in good_houses if m['scores'].get("cutoff") is None
            ]
        good_houses = [
            m for m in good_houses
            if m['price'] >= min_price and m['price'] <= max_price
        ]
        sorted_good_houses = HouseScore.sort_by_total_score_vs_price(
            good_houses)
        return sorted_good_houses

    @staticmethod
    def get_houses_html(houses, title="", active_only=True, only_good=True):
        """Build the full HTML report: All/Low/Medium/High price-band tabs
        rendered into report_template.html. Price bands overlap deliberately.
        """
        tabs = None
        filtered_matches = HouseScore.search_for_range(
            houses,
            active_only=active_only,
            min_price=490000,
            max_price=760000,
            only_good=only_good)
        tab_content = HouseScore.get_houses_category_html(
            filtered_matches,
            category_id="full",
            summary="Found ( {} / {} ) good houses".format(
                len(filtered_matches), len(houses)))
        tabs = HouseScore.add_html_tab(tab_id="full_id",
                                       tab_name="All Houses",
                                       tab_content=tab_content,
                                       tabs=tabs)
        filtered_matches = HouseScore.search_for_range(
            houses,
            active_only=active_only,
            min_price=490000,
            max_price=610000,
            only_good=only_good)
        tab_content = HouseScore.get_houses_category_html(
            filtered_matches,
            category_id="low",
            summary="Found ( {} / {} ) good houses".format(
                len(filtered_matches), len(houses)))
        tabs = HouseScore.add_html_tab(tab_id="low_id",
                                       tab_name="Low",
                                       tab_content=tab_content,
                                       tabs=tabs)
        filtered_matches = HouseScore.search_for_range(
            houses,
            active_only=active_only,
            min_price=590000,
            max_price=710000,
            only_good=only_good)
        tab_content = HouseScore.get_houses_category_html(
            filtered_matches,
            category_id="med",
            summary="Found ( {} / {} ) good houses".format(
                len(filtered_matches), len(houses)))
        tabs = HouseScore.add_html_tab(tab_id="med_id",
                                       tab_name="Medium",
                                       tab_content=tab_content,
                                       tabs=tabs)
        filtered_matches = HouseScore.search_for_range(
            houses,
            active_only=active_only,
            min_price=690000,
            max_price=760000,
            only_good=only_good)
        tab_content = HouseScore.get_houses_category_html(
            filtered_matches,
            category_id="high",
            summary="Found ( {} / {} ) good houses".format(
                len(filtered_matches), len(houses)))
        tabs = HouseScore.add_html_tab(tab_id="high_id",
                                       tab_name="High",
                                       tab_content=tab_content,
                                       tabs=tabs)
        with open(os.path.join(SCRIPT_DIR, "report_template.html"),
                  "r") as html_template_stream:
            html_template = html_template_stream.read()
            html_template = html_template.replace("{$TITLE}", title)
            html_template = html_template.replace("{$TABS}",
                                                  "\n".join(tabs['tabs']))
            html_template = html_template.replace(
                "{$TAB_CONTENTS}", "\n".join(tabs['contents']))
        return html_template

    @staticmethod
    def sort_by_total_score_vs_price(good_houses):
        """Sort houses descending by score-value percentage, preserving
        insertion order within equal percentages.
        """
        cost_gain = {}
        for m in good_houses:
            # `gain` is computed but only `gain_percentage` drives the sort.
            gain = m['scores']['value']["money"] - m['scores']['facts']['price']
            gain_percentage = m['scores']['value']["value"]
            if cost_gain.get(gain_percentage) is None:
                cost_gain[gain_percentage] = []
            cost_gain[gain_percentage].append(m)
        sorted_keys = sorted(cost_gain, reverse=True)
        #logging.debug(sorted_keys)
        retVal = []
        for k in sorted_keys:
            for m in cost_gain[k]:
                #m['scores']['gain_percentage'] = k
                retVal.append(m)
        return retVal

    def SearchByUrl(self,
                    house_url,
                    get_details=True,
                    force=False,
                    cache_time_format="%Y%m%d"):
        """Fetch one house by Redfin URL and run it through post-processing."""
        house, details = self.rfapi.get_house_by_url(
            house_url, force=force, cache_time_format=cache_time_format)
        return self.post_process([house],
                                 "SearchByUrl({})".format(house_url),
                                 get_details=get_details,
                                 force=force)

    def Search(self, search_name, search_json, get_details=False, force=False):
        """Run a saved search query and post-process all matches."""
        logging.debug(
            "Search(search_name={}, search_json={}, get_details={}, force={})".
            format(search_name, search_json, get_details, force))
        matches = self.rfapi.retrieve_json(search_json, force=force)
        return self.post_process(matches,
                                 search_name,
                                 get_details=get_details,
                                 force=force)

    def SearchForZIPCodes(self,
                          search_name,
                          search_json,
                          zip_codes,
                          get_details=False,
                          force=False):
        """Run the search once per ZIP code (converted to a region id),
        mutating search_json's region_id each pass, and post-process the
        combined matches.
        """
        zip_regions = [
            self.rfapi.zipcode_to_regionid(zipcode, False)
            for zipcode in zip_codes
        ]
        # need to convert to region_id
        region_types = [2 for zipcode in zip_codes]
        all_matches = []
        for region in zip_regions:
            search_json["region_id"] = [region]
            matches = self.rfapi.retrieve_json(search_json, force=force)
            all_matches += matches
        return self.post_process(all_matches,
                                 search_name,
                                 get_details=get_details,
                                 force=force)

    @staticmethod
    def LoadFavorits(FavoriteLocationsFile):
        """Load favorite locations from JSON, dropping entries with importance == 0."""
        with open(FavoriteLocationsFile, "r") as stream:
            fav = json.load(stream)
        ret = []
        for v in fav:
            if v['importance'] != 0:
                ret.append(v)
        return ret
{ "not": { "ip": "127.0.0.1" } }, { "not": { "ip": "0.0.0.0" } } ], "limit": 10 }, "output": { "exclude": [ "stats" ], "inline_entities": True } } # Using RFAPI module, run query # Note: To pull back all results, use rfqapi.paged_query(q) # and a higher limit. rfqapi = RFAPI(token) result = rfqapi.query(q) # Display the results (in this case, limit is 1) for res in result['events']: print "Event: \n" print str(res) + '\n'
class IOCEnricher(object):
    '''Enriches a list of IOCs with data from Recorded Future.
    '''
    # Entity types this enricher knows how to look up.
    _VALID_TYPES = ["IpAddress", "Hash", "InternetDomainName"]
    _INSTANCES_OR_DOCUMENTS = 'instances'
    # Fragment substrings that count toward MaliciousHits.
    _MALICIOUS_INDICATORS = [
        "compromised", "malicious", "suspected", "threat", "malware",
        "infected", "honeypot", "attacked from", "exploit", "attacks from",
        "bad http request from", "attack detected", "attack deteted"
    ]
    _RELATED_ENTITY_TYPES = [
        'Malware', 'CyberVulnerability', 'IpAddress', 'Hash',
        'InternetDomainName'
    ]
    # can be "document" also, but enrichment will take much longer
    # to pull document-level co-entities. fragment-level will use
    # extended_entities where available
    _RELATED_ENTITY_SCOPE = "fragment"
    # Feature templates per mode; values are the per-IOC defaults.
    _FEATURES = {
        "debug":
        collections.OrderedDict([("RFID", ""), ("EntityType", ""),
                                 ("TotalHits", 0), ("7DayHits", 0),
                                 ("1DayHits", 0), ("MaliciousHits", 0),
                                 ("InfoSecHits", 0), ("PasteHits", 0),
                                 ("SocialMediaHits", 0)]),
        "related":
        collections.OrderedDict([("RelatedMalware", []),
                                 ("RelatedCyberVulnerability", []),
                                 ("RelatedIpAddress", []),
                                 ("RelatedInternetDomainName", []),
                                 ("RelatedHash", []),
                                 ("RelatedMalwareCount", 0),
                                 ("RelatedCyberVulnerabilityCount", 0),
                                 ("RelatedIpAddressCount", 0),
                                 ("RelatedInternetDomainNameCount", 0),
                                 ("RelatedHashCount", 0), ("Score", 0.0)]),
        "core":
        collections.OrderedDict([("Name", ""), ("RFURL", ""),
                                 ("MostRecent", ""), ("MostRecentSource", ""),
                                 ("MostRecentTitle", ""),
                                 ("MostRecentFragment", ""),
                                 ("MostRecentURL", ""),
                                 ("RecentInfoSecSource", ""),
                                 ("RecentInfoSecTitle", ""),
                                 ("RecentInfoSecFragment", ""),
                                 ("RecentInfoSecURL", ""),
                                 ("RecentPasteSource", ""),
                                 ("RecentPasteTitle", ""),
                                 ("RecentPasteFragment", ""),
                                 ("RecentPasteURL", ""),
                                 ("RecentSocialMediaSource", ""),
                                 ("RecentSocialMediaTitle", ""),
                                 ("RecentSocialMediaFragment", ""),
                                 ("RecentSocialMediaURL", ""),
                                 ("FirstSource", ""), ("FirstTitle", ""),
                                 ("FirstFragment", ""), ("FirstURL", ""),
                                 ("FirstPublished", "")])
    }

    def __init__(self, token, iocs, entity_type, mode='core'):
        '''
        Parameters
        ----------
        token : str
            Recorded Future API token
        iocs : list or dict
            List of IOCs to enrich or dict of IOCs keyed by name with the
            value as the RFID.
        entity_type : {"IpAddress", "Hash", "InternetDomainName"}
            Name of Recorded Future entity type for IOC.
        mode : {"core", "related", "debug"}
            Subset of features to return with enrichment. "core" is default.
        '''
        self.rfqapi = RFAPI(token)
        self.response = collections.OrderedDict()
        # need all features early for scoring; they're removed later
        # need to test whether this can be avoided
        # NOTE(review): `keys` aliases the class-level _FEATURES['core']
        # OrderedDict, so the update() calls below mutate the shared class
        # attribute across instances — confirm this is intended.
        keys = self._FEATURES['core']
        keys.update(self._FEATURES['debug'])
        if mode in ('related', 'debug'):
            keys.update(self._FEATURES['related'])
        if mode not in ('core', 'related', 'debug'):
            raise ValueError(
                '"mode" must be one of ("core", "related", "debug"). Input: %s.'
                % mode)
        self.mode = mode
        self.entity_type = entity_type
        if isinstance(iocs, list):
            # Resolve names to RFIDs when given a plain list.
            self.iocs = self._get_rfids(iocs)
        elif isinstance(iocs, dict):
            self.iocs = iocs
        else:
            raise ValueError('"iocs" must be list or dict.')
        # Seed one response record per IOC from the feature defaults.
        for ioc in self.iocs:
            new_resp = {}
            for key in keys:
                new_resp[key] = keys[key]
                if key == 'Name':
                    new_resp[key] = ioc
                elif key == 'RFID':
                    new_resp[key] = self.iocs[ioc]
                elif key == 'EntityType':
                    new_resp[key] = self.entity_type
            self.response[ioc] = new_resp
        self.keys = keys

    def get_keys(self, mode=None):
        '''Getter for the keys in the response.
        '''
        return [
            key for key in self.keys
            if key not in self._get_extra_features(mode)
        ]

    def _get_extra_features(self, mode=None):
        # Debug-only keys are stripped from non-debug responses.
        if not mode:
            mode = self.mode
        extra_features = []
        if mode in ('core', 'related'):
            extra_features = self._FEATURES['debug'].keys()
        return extra_features

    def enrich(self):
        '''Enriches the given IOC.

        Returns
        -------
        response : dict
            The enrichment package containing all keys requested by "mode"
            parameter.
        '''
        print " Getting all references"
        max_index = None
        for names in _chunks(self.iocs.keys(), 250):
            refs, edetails = self._get_all_references(names)
            print " Getting enrichment from references"
            max_index_cand = self._get_enrichment(refs, edetails)
            # NOTE: comparing against None works on Python 2 only.
            if max_index_cand < max_index or not max_index:
                # using < here because the references are no longer retrieved all from
                # the same query, so there may be timings, so we're looking at the minimax
                max_index = max_index_cand
        print " Getting URL and Score"
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            # Get RF URL
            if 'RFURL' in ioc_resp:
                ioc_resp['RFURL'] = _generate_rfURL_from_entity(
                    ioc, ioc_resp.get('RFID', None))
            # Score the ref
            if 'Score' in ioc_resp:
                self.score(ioc_resp)
            # Remove unnecessary features
            extra_features = self._get_extra_features()
            for key in extra_features:
                del ioc_resp[key]
        return self.response, max_index

    def score(self, ioc_resp):
        '''Compute a 0..1 risk score in place from hit/related-count features.'''
        spec_keys = ('7DayHits', '1DayHits')
        nonzero_keys = ('MaliciousHits', 'InfoSecHits', 'PasteHits',
                        'RelatedMalwareCount',
                        'RelatedCyberVulnerabilityCount',
                        'RelatedIpAddressCount',
                        'RelatedInternetDomainNameCount', 'RelatedHashCount')
        max_score = 0.0
        # score special keys: recent hits count when they dominate total hits
        if 'TotalHits' in self.keys:
            for key in filter(lambda k: k in self.keys, spec_keys):
                if ((ioc_resp[key] * 2) > ioc_resp["TotalHits"]):
                    ioc_resp['Score'] += 1
            max_score += len(spec_keys)
        # score nonzero keys
        for key in filter(lambda k: k in self.keys, nonzero_keys):
            if ioc_resp[key] > 0:
                ioc_resp['Score'] += 1
            max_score += 1
        ioc_resp['Score'] = ioc_resp['Score'] / max_score

    def _get_enrichment(self, refs, edetails):
        '''Fold a batch of references into self.response; returns the newest
        document "indexed" timestamp seen in the batch.
        '''
        max_index = None
        today = datetime.datetime.today()
        one_day_hit_string = _rfid_date_conv(today -
                                             datetime.timedelta(days=1))
        seven_day_hit_string = _rfid_date_conv(today -
                                               datetime.timedelta(days=7))
        # first get everything from all references
        print " Processing references"
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        # Invert the name->RFID map, skipping unresolved (falsy) RFIDs.
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        recent_pub = {
            "MostRecent": {},
            "Paste": {},
            "InfoSec": {},
            "SocialMedia": {}
        }
        first_pub = {}
        for ref in refs:
            indexed = ref['document']['indexed']
            if indexed > max_index or not max_index:
                max_index = indexed
            fragment = ref['fragment'].lower()
            attrs = ref['attributes']
            source_topic = ref['document']['sourceId'].get('topic', None)
            source_media_type = ref['document']['sourceId'].get(
                'media_type', None)
            pub_date = ref['document']['published']
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc,
                           attrs.get('entities', []))
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [
                ioc for ioc in ioc_to_rfid
                if (ioc in fragment and ioc_to_rfid[ioc] not in ioc_rfids)
            ]
            # increment hit counts and get recent hits
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                # update dates
                recent_pub['MostRecent'][ioc] = self._safe_update_date(
                    ioc_resp, pub_date, recent_pub['MostRecent'][ioc]
                    if ioc in recent_pub['MostRecent'] else '', 'MostRecent',
                    pub_date > recent_pub['MostRecent'][ioc]
                    if ioc in recent_pub['MostRecent']
                    and len(recent_pub['MostRecent'][ioc]) > 0 else True)
                first_pub[ioc] = self._safe_update_date(
                    ioc_resp, pub_date,
                    first_pub[ioc] if ioc in first_pub else '',
                    'FirstPublished', pub_date < first_pub[ioc]
                    if ioc in first_pub and len(first_pub[ioc]) > 0 else True)
                # update hit counters
                self._safe_update_hits(ioc_resp, 'TotalHits', True)
                self._safe_update_hits(ioc_resp, '1DayHits',
                                       pub_date >= one_day_hit_string)
                self._safe_update_hits(ioc_resp, '7DayHits',
                                       pub_date >= seven_day_hit_string)
                self._safe_update_hits(
                    ioc_resp, 'MaliciousHits',
                    any(term in fragment
                        for term in self._MALICIOUS_INDICATORS))
                # update hit counters and references
                # Magic ids below are Recorded Future source topic/media ids.
                conditions = {
                    "InfoSec": source_topic == 'KPzZAE',
                    "Paste": source_media_type == 'KDS1Zp',
                    "SocialMedia": source_media_type == 'JxSEtC'
                }
                for key in conditions:
                    condition = conditions[key]
                    recent_pub[key][ioc] = self._safe_update_hits_and_refs(
                        ioc_resp, ref, key, condition, recent_pub[key][ioc]
                        if ioc in recent_pub[key] else '',
                        pub_date > recent_pub[key][ioc]
                        if ioc in recent_pub[key]
                        and len(recent_pub[key][ioc]) > 0 else True)
                # update references for first and recent
                self._safe_update_refs(
                    ioc_resp, ref, 'MostRecent',
                    pub_date == recent_pub['MostRecent'][ioc])
                self._safe_update_refs(ioc_resp, ref, 'First',
                                       pub_date == first_pub[ioc])
        # get related content at fragment scope
        if self.mode in ('debug', 'related'
                         ) and self._RELATED_ENTITY_SCOPE == 'fragment':
            self._safe_get_related_entities_from_frags(refs, edetails)
        # get related content at document scope
        if self.mode in ('debug', 'related'
                         ) and self._RELATED_ENTITY_SCOPE == 'document':
            # print "Getting related content from documents"
            docs = self._get_docs()
            self._safe_get_related_entities_from_docs(docs)
        return max_index

    def _safe_update_hits_and_refs(self, ioc_resp, ref, key, condition,
                                   cur_date, date_condition):
        '''Update the <key>Hits counter and Recent<key>* reference fields when
        condition holds; returns the (possibly updated) tracked date.
        '''
        pub_date = ref['document']['published']
        date_update = self._safe_update_date(ioc_resp, pub_date, cur_date, key,
                                             date_condition and condition)
        if condition:
            # update hits
            self._safe_update_hits(ioc_resp, key + 'Hits', condition)
            # get recent frags
            self._safe_update_refs(ioc_resp, ref, 'Recent' + key,
                                   pub_date == date_update)
        return date_update

    def _safe_update_date(self, ioc_resp, date, existing_val, key, condition):
        # Write the date only when the key is tracked; always return the
        # winning value so callers can keep their own tracking dicts in sync.
        if condition and key in ioc_resp:
            ioc_resp[key] = date
        return date if condition else existing_val

    def _safe_update_hits(self, ioc_resp, key, condition):
        # Increment only tracked counters, only when the condition holds.
        if condition and key in ioc_resp:
            ioc_resp[key] += 1

    def _safe_update_refs(self, ioc_resp, ref, key, condition):
        # Copy source/title/fragment/url from the reference into the
        # <key>Source/<key>Title/... fields, newline-stripped.
        if condition:
            key_suffixes = {
                'Source':
                ref['document']['sourceId']['name'].replace('\n', ' ').replace(
                    '\r', ' '),
                'Title':
                ref['document']['title'].replace('\n', ' ').replace('\r', ' '),
                'Fragment':
                ref['fragment'].replace('\n', ' ').replace('\r', ' '),
                'URL':
                ref['document']['url'] if 'url' in ref['document'] else ''
            }
            for suffix in filter(lambda suf: key + suf in ioc_resp,
                                 key_suffixes):
                ioc_resp[key + suffix] = key_suffixes[suffix]

    def _get_all_references(self, names):
        '''Scan-query all Event references matching the IOC names (as fragment
        strings or resolved entity ids); returns (deduped refs, entity details
        limited to _RELATED_ENTITY_TYPES).
        '''
        refs = []
        seen_ids = set()
        edetails = {}
        q = {
            "instance": {
                "type": "Event",
                "limit": 25000,
                "searchtype": "scan"
            }
        }
        q['instance']['attributes'] = [[{
            "name": "Event.event_fragment",
            'string': names
        }]]
        rfids = [self.iocs[name] for name in names if self.iocs[name]]
        q['instance']['attributes'][0].append({
            "name": "entities",
            "entity": {
                "id": rfids
            }
        })
        # print len(self.iocs.keys()),
        for res in self.rfqapi.paged_query(q):
            refs.extend([
                inst for inst in res['instances'] if inst['id'] not in seen_ids
            ])
            seen_ids.update([inst['id'] for inst in res['instances']])
            edetails.update({
                eid: res['entities'][eid]
                for eid in res['entities'] if res['entities'][eid]['type'] in
                self._RELATED_ENTITY_TYPES
            })
        return refs, edetails

    def _get_docs(self):
        '''Collect document ids per IOC (stored on each response record as
        "DocumentIds") via count aggregation; returns the combined id list.
        '''
        all_docs = set()
        for names in _chunks(self.iocs.keys(), 250):
            if len(names) == 0:
                continue
            q = {
                "instance": {
                    "type": "Event"
                },
                "output": {
                    "count": {
                        "axis": [{
                            "name": "attributes.entities",
                            "type": [self.entity_type],
                            "aspect": "name"
                        }, "document"],
                        "values": [self._INSTANCES_OR_DOCUMENTS]
                    }
                }
            }
            q['instance']['attributes'] = [[{
                "name": "Event.event_fragment",
                'string': names
            }]]
            rfids = [self.iocs[name] for name in names if self.iocs[name]]
            q['instance']['attributes'][0].append({
                "name": "entities",
                "entity": {
                    "id": rfids
                }
            })
            res = self.rfqapi.query(q)
            counts = res["counts"][0]
            if len(counts) != 0:
                for ioc in filter(lambda i: i in self.iocs, counts):
                    docids = counts[ioc].keys()
                    self.response[ioc]['DocumentIds'] = docids
                    all_docs.update(docids)
        return list(all_docs)

    def _safe_get_related_entities_from_frags(self, refs, edetails):
        '''Populate Related<Type> lists (and counts) from entities co-occurring
        with each IOC in reference fragments.
        '''
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        # Resolve any co-occurring entities we have not seen details for yet.
        entities_to_lookup = set()
        for ref in refs:
            related_ents = ref['attributes'].get(
                'extended_entities', ref['attributes'].get('entities', []))
            entities_to_lookup.update(
                [eid for eid in related_ents if eid not in edetails])
        # print "Updating entity resolution"
        edetails.update(
            self._resolve_related_entities(list(entities_to_lookup)))
        # print "Updated related entities"
        for ref in refs:
            fragment = ref['fragment'].lower()
            # get related entities from reference
            related_ents = ref['attributes'].get(
                'extended_entities', ref['attributes'].get('entities', []))
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc, related_ents)
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [
                ioc for ioc in ioc_to_rfid
                if (ioc in fragment and ioc_to_rfid[ioc] not in ioc_rfids)
            ]
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                # Record every resolved co-entity except the IOC itself.
                for ent in filter(
                        lambda eid: eid in edetails and eid != ioc_resp['RFID'],
                        related_ents):
                    etype, name = edetails[ent]['type'], edetails[ent]['name']
                    if name not in ioc_resp['Related' + etype]:
                        ioc_resp['Related' + etype].append(name)
        # Finalize counts and drop list fields not requested by the mode.
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(
                        ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and 'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _resolve_related_entities(self, eids):
        '''Bulk-resolve entity ids to details, keeping only _RELATED_ENTITY_TYPES.'''
        if len(eids) == 0:
            return {}
        results = {}
        for ents in _chunks(eids, 250):
            q = {"entity": {"id": ents, "limit": 1001}}
            res = self.rfqapi.query(q)
            results.update({
                eid: res['entity_details'][eid]
                for eid in res['entity_details'] if res['entity_details'][eid]
                ['type'] in self._RELATED_ENTITY_TYPES
            })
        return results

    def _safe_get_related_entities_from_docs(self, docs):
        '''Populate Related<Type> lists (and counts) from entities co-occurring
        with each IOC at document scope; consumes the per-IOC "DocumentIds"
        collected by _get_docs and removes untracked helper fields afterwards.
        '''
        for docids in _chunks(docs, 250):
            q = {
                "instance": {
                    "type": "Event",
                    "document": {
                        "id": docids
                    }
                },
                "output": {
                    "count": {
                        "axis": [
                            "document", {
                                "name": "attributes.entities",
                                "type": self._RELATED_ENTITY_TYPES,
                                "aspect": "all"
                            }
                        ],
                        "values": [self._INSTANCES_OR_DOCUMENTS]
                    }
                }
            }
            res = self.rfqapi.query(q)
            counts = res['counts'][0]
            for ioc in self.response:
                ioc_resp = self.response[ioc]
                for docid in filter(lambda did: did in counts,
                                    ioc_resp['DocumentIds']):
                    for asp_name in filter(lambda n: n != 'NONE',
                                           counts[docid]):
                        name, unused, etype = rf_agg_name_parser(asp_name)
                        if name == ioc:
                            continue
                        # update related counts
                        if name not in ioc_resp['Related' + etype]:
                            ioc_resp['Related' + etype].append(name)
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            if 'DocumentIds' not in self.keys and 'DocumentIds' in ioc_resp:
                del ioc_resp['DocumentIds']
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(
                        ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and 'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _get_rfids(self, iocs):
        '''Resolve IOC names to RFIDs; unresolved names map to None.'''
        new_iocs = collections.OrderedDict()
        edetails = {}
        for names in _chunks(iocs, 250):
            if len(names) == 0:
                continue
            q = {
                "entity": {
                    "name": names,
                    "type": self.entity_type,
                    "limit": 501
                }
            }
            res = self.rfqapi.query(q)
            if len(res['entities']) == 0:
                continue
            for ent in res['entities']:
                edetails[res['entity_details'][ent]['name']] = ent
        for ioc in iocs:
            new_iocs[ioc] = edetails[ioc] if ioc in edetails else None
        return new_iocs
class IOCEnricher(object):
    '''Enriches a list of IOCs with data from Recorded Future.
    '''
    _VALID_TYPES = ["IpAddress", "Hash", "InternetDomainName"]
    # Aggregation value axis used in count queries.
    _INSTANCES_OR_DOCUMENTS = 'instances'
    # Fragment substrings that flag a reference as describing malicious
    # activity ("attack deteted" intentionally matches a recurring source
    # typo).
    _MALICIOUS_INDICATORS = ["compromised", "malicious", "suspected",
                             "threat", "malware", "infected", "honeypot",
                             "attacked from", "exploit", "attacks from",
                             "bad http request from", "attack detected",
                             "attack deteted"]
    _RELATED_ENTITY_TYPES = ['Malware', 'CyberVulnerability', 'IpAddress',
                             'Hash', 'InternetDomainName']
    # can be "document" also, but enrichment will take much longer
    # to pull document-level co-entities. fragment-level will use
    # extended_entities where available
    _RELATED_ENTITY_SCOPE = "fragment"
    # Default value for every output feature, grouped by the mode that
    # exposes it. NOTE: class-level and mutable -- never mutate in place
    # (see __init__, which copies before merging).
    _FEATURES = {"debug": collections.OrderedDict([("RFID", ""),
                                                   ("EntityType", ""),
                                                   ("TotalHits", 0),
                                                   ("7DayHits", 0),
                                                   ("1DayHits", 0),
                                                   ("MaliciousHits", 0),
                                                   ("InfoSecHits", 0),
                                                   ("PasteHits", 0),
                                                   ("SocialMediaHits", 0)]),
                 "related": collections.OrderedDict([("RelatedMalware", []),
                                                     ("RelatedCyberVulnerability", []),
                                                     ("RelatedIpAddress", []),
                                                     ("RelatedInternetDomainName", []),
                                                     ("RelatedHash", []),
                                                     ("RelatedMalwareCount", 0),
                                                     ("RelatedCyberVulnerabilityCount", 0),
                                                     ("RelatedIpAddressCount", 0),
                                                     ("RelatedInternetDomainNameCount", 0),
                                                     ("RelatedHashCount", 0),
                                                     ("Score", 0.0)]),
                 "core": collections.OrderedDict([("Name", ""),
                                                  ("RFURL", ""),
                                                  ("MostRecent", ""),
                                                  ("MostRecentSource", ""),
                                                  ("MostRecentTitle", ""),
                                                  ("MostRecentFragment", ""),
                                                  ("MostRecentURL", ""),
                                                  ("RecentInfoSecSource", ""),
                                                  ("RecentInfoSecTitle", ""),
                                                  ("RecentInfoSecFragment", ""),
                                                  ("RecentInfoSecURL", ""),
                                                  ("RecentPasteSource", ""),
                                                  ("RecentPasteTitle", ""),
                                                  ("RecentPasteFragment", ""),
                                                  ("RecentPasteURL", ""),
                                                  ("RecentSocialMediaSource", ""),
                                                  ("RecentSocialMediaTitle", ""),
                                                  ("RecentSocialMediaFragment", ""),
                                                  ("RecentSocialMediaURL", ""),
                                                  ("FirstSource", ""),
                                                  ("FirstTitle", ""),
                                                  ("FirstFragment", ""),
                                                  ("FirstURL", ""),
                                                  ("FirstPublished", "")])}

    def __init__(self, token, iocs, entity_type, mode='core'):
        '''
        Parameters
        ----------
        token : str
            Recorded Future API token
        iocs : list or dict
            List of IOCs to enrich or dict of IOCs keyed by name with the
            value as the RFID.
        entity_type : {"IpAddress", "Hash", "InternetDomainName"}
            Name of Recorded Future entity type for IOC.
        mode : {"core", "related", "debug"}
            Subset of features to return with enrichment. "core" is default.

        Raises
        ------
        ValueError
            If "mode" is not one of the allowed values or "iocs" is neither
            a list nor a dict.
        '''
        self.rfqapi = RFAPI(token)
        self.response = collections.OrderedDict()
        # need all features early for scoring; they're removed later
        # need to test whether this can be avoided
        # BUGFIX: copy the class-level dict before merging. The original
        # code aliased self._FEATURES['core'] and update() permanently
        # polluted the shared class attribute for every later instance.
        keys = collections.OrderedDict(self._FEATURES['core'])
        keys.update(self._FEATURES['debug'])
        if mode in ('related', 'debug'):
            keys.update(self._FEATURES['related'])
        if mode not in ('core', 'related', 'debug'):
            raise ValueError('"mode" must be one of ("core", "related", "debug"). Input: %s.' % mode)
        self.mode = mode
        self.entity_type = entity_type
        if isinstance(iocs, list):
            self.iocs = self._get_rfids(iocs)
        elif isinstance(iocs, dict):
            self.iocs = iocs
        else:
            raise ValueError('"iocs" must be list or dict.')
        for ioc in self.iocs:
            new_resp = {}
            for key in keys:
                default = keys[key]
                # BUGFIX: copy list defaults so each IOC owns its own
                # Related<Type> list. The original assigned the same list
                # object to every response, so appends leaked across IOCs.
                new_resp[key] = list(default) if isinstance(default, list) else default
                if key == 'Name':
                    new_resp[key] = ioc
                elif key == 'RFID':
                    new_resp[key] = self.iocs[ioc]
                elif key == 'EntityType':
                    new_resp[key] = self.entity_type
            self.response[ioc] = new_resp
        self.keys = keys

    def get_keys(self, mode=None):
        '''Getter for the keys in the response, excluding features hidden
        by the given mode (defaults to the instance's mode).
        '''
        return [key for key in self.keys
                if key not in self._get_extra_features(mode)]

    def _get_extra_features(self, mode=None):
        # Debug-only features are stripped from 'core' and 'related' output.
        if not mode:
            mode = self.mode
        extra_features = []
        if mode in ('core', 'related'):
            extra_features = self._FEATURES['debug'].keys()
        return extra_features

    def enrich(self):
        '''Enriches the given IOCs.

        Returns
        -------
        (response, max_index) : (dict, str or None)
            "response" maps each IOC name to its enrichment package
            containing all keys requested by the "mode" parameter;
            "max_index" is the smallest per-chunk maximum document index
            seen (minimax), or None when no references were found.
        '''
        print(" Getting all references")
        max_index = None
        for names in _chunks(list(self.iocs.keys()), 250):
            refs, edetails = self._get_all_references(names)
            print(" Getting enrichment from references")
            max_index_cand = self._get_enrichment(refs, edetails)
            # using < here because the references are no longer retrieved
            # all from the same query, so there may be timing differences;
            # we're looking at the minimax.
            # BUGFIX: skip chunks with no references (cand is None); the
            # original comparison let None clobber a valid max_index.
            if max_index_cand is not None and (not max_index or
                                               max_index_cand < max_index):
                max_index = max_index_cand
        print(" Getting URL and Score")
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            # Get RF URL
            if 'RFURL' in ioc_resp:
                ioc_resp['RFURL'] = _generate_rfURL_from_entity(
                    ioc, ioc_resp.get('RFID', None))
            # Score the ref
            if 'Score' in ioc_resp:
                self.score(ioc_resp)
            # Remove features that the requested mode does not expose.
            extra_features = self._get_extra_features()
            for key in extra_features:
                del ioc_resp[key]
        return self.response, max_index

    def score(self, ioc_resp):
        '''Compute a naive 0..1 risk score in ioc_resp['Score']: one point
        per triggered indicator, normalized by the number of indicators
        actually evaluated.
        '''
        spec_keys = ('7DayHits', '1DayHits')
        nonzero_keys = ('MaliciousHits', 'InfoSecHits', 'PasteHits',
                        'RelatedMalwareCount',
                        'RelatedCyberVulnerabilityCount',
                        'RelatedIpAddressCount',
                        'RelatedInternetDomainNameCount',
                        'RelatedHashCount')
        max_score = 0.0
        # score special keys: a point when recent hits dominate total hits
        if 'TotalHits' in self.keys:
            for key in filter(lambda k: k in self.keys, spec_keys):
                if ((ioc_resp[key] * 2) > ioc_resp["TotalHits"]):
                    ioc_resp['Score'] += 1
                # BUGFIX: count only keys actually evaluated; the original
                # always added len(spec_keys) even when some were absent.
                max_score += 1
        # score nonzero keys
        for key in filter(lambda k: k in self.keys, nonzero_keys):
            if ioc_resp[key] > 0:
                ioc_resp['Score'] += 1
            max_score += 1
        # Guard the division; leave Score untouched when nothing was scored.
        if max_score > 0:
            ioc_resp['Score'] = ioc_resp['Score'] / max_score

    def _get_enrichment(self, refs, edetails):
        '''Fold a batch of references into self.response (hit counters,
        recent/first reference snippets, and optionally related entities).
        Returns the largest document "indexed" value seen, or None.
        '''
        max_index = None
        today = datetime.datetime.today()
        one_day_hit_string = _rfid_date_conv(today - datetime.timedelta(days=1))
        seven_day_hit_string = _rfid_date_conv(today - datetime.timedelta(days=7))
        # first get everything from all references
        print(" Processing references")
        ioc_to_rfid = self.iocs
        # Invert the mapping, skipping IOCs that did not resolve to an RFID.
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        recent_pub = {"MostRecent": {}, "Paste": {}, "InfoSec": {},
                      "SocialMedia": {}}
        first_pub = {}
        for ref in refs:
            indexed = ref['document']['indexed']
            if not max_index or indexed > max_index:
                max_index = indexed
            fragment = ref['fragment'].lower()
            attrs = ref['attributes']
            source_topic = ref['document']['sourceId'].get('topic', None)
            source_media_type = ref['document']['sourceId'].get('media_type', None)
            pub_date = ref['document']['published']
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc,
                           attrs.get('entities', []))
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [ioc for ioc in ioc_to_rfid
                          if (ioc in fragment and
                              ioc_to_rfid[ioc] not in ioc_rfids)]
            # increment hit counts and get recent hits
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                # update dates
                recent_pub['MostRecent'][ioc] = self._safe_update_date(
                    ioc_resp, pub_date,
                    recent_pub['MostRecent'][ioc]
                    if ioc in recent_pub['MostRecent'] else '',
                    'MostRecent',
                    pub_date > recent_pub['MostRecent'][ioc]
                    if ioc in recent_pub['MostRecent'] and
                    len(recent_pub['MostRecent'][ioc]) > 0 else True)
                first_pub[ioc] = self._safe_update_date(
                    ioc_resp, pub_date,
                    first_pub[ioc] if ioc in first_pub else '',
                    'FirstPublished',
                    pub_date < first_pub[ioc]
                    if ioc in first_pub and len(first_pub[ioc]) > 0 else True)
                # update hit counters
                self._safe_update_hits(ioc_resp, 'TotalHits', True)
                self._safe_update_hits(ioc_resp, '1DayHits',
                                       pub_date >= one_day_hit_string)
                self._safe_update_hits(ioc_resp, '7DayHits',
                                       pub_date >= seven_day_hit_string)
                self._safe_update_hits(ioc_resp, 'MaliciousHits',
                                       any(term in fragment for term in
                                           self._MALICIOUS_INDICATORS))
                # update hit counters and references
                # (the id literals are RF source topic / media-type ids)
                conditions = {"InfoSec": source_topic == 'KPzZAE',
                              "Paste": source_media_type == 'KDS1Zp',
                              "SocialMedia": source_media_type == 'JxSEtC'}
                for key in conditions:
                    condition = conditions[key]
                    recent_pub[key][ioc] = self._safe_update_hits_and_refs(
                        ioc_resp, ref, key, condition,
                        recent_pub[key][ioc] if ioc in recent_pub[key] else '',
                        pub_date > recent_pub[key][ioc]
                        if ioc in recent_pub[key] and
                        len(recent_pub[key][ioc]) > 0 else True)
                # update references for first and recent
                self._safe_update_refs(ioc_resp, ref, 'MostRecent',
                                       pub_date == recent_pub['MostRecent'][ioc])
                self._safe_update_refs(ioc_resp, ref, 'First',
                                       pub_date == first_pub[ioc])
        # get related content at fragment scope
        if self.mode in ('debug', 'related') and \
                self._RELATED_ENTITY_SCOPE == 'fragment':
            self._safe_get_related_entities_from_frags(refs, edetails)
        # get related content at document scope
        if self.mode in ('debug', 'related') and \
                self._RELATED_ENTITY_SCOPE == 'document':
            docs = self._get_docs()
            self._safe_get_related_entities_from_docs(docs)
        return max_index

    def _safe_update_hits_and_refs(self, ioc_resp, ref, key, condition,
                                   cur_date, date_condition):
        # Update <key>Hits and Recent<key>* fields when "condition" marks
        # the reference as belonging to this source category. Returns the
        # date to carry forward as the most recent for this category.
        pub_date = ref['document']['published']
        date_update = self._safe_update_date(ioc_resp, pub_date, cur_date,
                                             key,
                                             date_condition and condition)
        if condition:
            # update hits
            self._safe_update_hits(ioc_resp, key + 'Hits', condition)
            # get recent frags
            self._safe_update_refs(ioc_resp, ref, 'Recent' + key,
                                   pub_date == date_update)
        return date_update

    def _safe_update_date(self, ioc_resp, date, existing_val, key, condition):
        # Store "date" under "key" when the condition holds and the key is
        # tracked; always return the value the caller should keep.
        if condition and key in ioc_resp:
            ioc_resp[key] = date
        return date if condition else existing_val

    def _safe_update_hits(self, ioc_resp, key, condition):
        # Increment the counter only when tracked and the condition holds.
        if condition and key in ioc_resp:
            ioc_resp[key] += 1

    def _safe_update_refs(self, ioc_resp, ref, key, condition):
        # Copy source/title/fragment/URL of "ref" into the <key>* fields
        # that exist in the response, flattening newlines for CSV safety.
        if condition:
            key_suffixes = {'Source': ref['document']['sourceId']['name'].replace('\n', ' ').replace('\r', ' '),
                            'Title': ref['document']['title'].replace('\n', ' ').replace('\r', ' '),
                            'Fragment': ref['fragment'].replace('\n', ' ').replace('\r', ' '),
                            'URL': ref['document']['url'] if 'url' in ref['document'] else ''}
            for suffix in filter(lambda suf: key + suf in ioc_resp,
                                 key_suffixes):
                ioc_resp[key + suffix] = key_suffixes[suffix]

    def _get_all_references(self, names):
        '''Fetch all event references mentioning the given IOC names (by
        fragment string or resolved entity id), de-duplicated by instance
        id. Returns (refs, edetails) where edetails holds details of
        co-occurring entities of the tracked related types.
        '''
        refs = []
        seen_ids = set()
        edetails = {}
        q = {"instance": {"type": "Event",
                          "limit": 25000,
                          "searchtype": "scan"}}
        q['instance']['attributes'] = [[{"name": "Event.event_fragment",
                                         'string': names}]]
        rfids = [self.iocs[name] for name in names if self.iocs[name]]
        q['instance']['attributes'][0].append({"name": "entities",
                                               "entity": {"id": rfids}})
        for res in self.rfqapi.paged_query(q):
            refs.extend([inst for inst in res['instances']
                         if inst['id'] not in seen_ids])
            seen_ids.update([inst['id'] for inst in res['instances']])
            edetails.update({
                eid: res['entities'][eid] for eid in res['entities']
                if res['entities'][eid]['type'] in self._RELATED_ENTITY_TYPES})
        return refs, edetails

    def _get_docs(self):
        '''Collect the ids of all documents mentioning each IOC, recording
        per-IOC document ids in the response. Returns the union as a list.
        '''
        all_docs = set()
        for names in _chunks(list(self.iocs.keys()), 250):
            q = {"instance": {"type": "Event"},
                 "output": {"count": {"axis": [{"name": "attributes.entities",
                                                "type": [self.entity_type],
                                                "aspect": "name"},
                                               "document"],
                                      "values": [self._INSTANCES_OR_DOCUMENTS]}}}
            q['instance']['attributes'] = [[{"name": "Event.event_fragment",
                                             'string': names}]]
            rfids = [self.iocs[name] for name in names if self.iocs[name]]
            q['instance']['attributes'][0].append({"name": "entities",
                                                   "entity": {"id": rfids}})
            res = self.rfqapi.query(q)
            counts = res["counts"][0]
            if len(counts) != 0:
                for ioc in filter(lambda i: i in self.iocs, counts):
                    docids = list(counts[ioc].keys())
                    self.response[ioc]['DocumentIds'] = docids
                    all_docs.update(docids)
        return list(all_docs)

    def _safe_get_related_entities_from_frags(self, refs, edetails):
        '''Extract co-occurring entities from reference fragments and fold
        them into each IOC's Related<Type> lists and counts.
        '''
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        # Resolve any related entity ids we have not seen details for yet.
        entities_to_lookup = set()
        for ref in refs:
            related_ents = ref['attributes'].get(
                'extended_entities', ref['attributes'].get('entities', []))
            entities_to_lookup.update(
                [eid for eid in related_ents if eid not in edetails])
        edetails.update(
            self._resolve_related_entities(list(entities_to_lookup)))
        for ref in refs:
            fragment = ref['fragment'].lower()
            # get related entities from reference
            related_ents = ref['attributes'].get(
                'extended_entities', ref['attributes'].get('entities', []))
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc, related_ents)
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [ioc for ioc in ioc_to_rfid
                          if (ioc in fragment and
                              ioc_to_rfid[ioc] not in ioc_rfids)]
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                # Attach every resolved co-occurring entity except the IOC
                # itself, de-duplicated.
                for ent in filter(lambda eid: eid in edetails and
                                  eid != ioc_resp['RFID'], related_ents):
                    etype, name = edetails[ent]['type'], edetails[ent]['name']
                    if name not in ioc_resp['Related' + etype]:
                        ioc_resp['Related' + etype].append(name)
        # Finalize counts; drop lists not requested by the caller's keys.
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(
                        ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and \
                        'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _resolve_related_entities(self, eids):
        # Resolve entity ids to details in batches of 250, keeping only
        # entities of the tracked related types.
        if len(eids) == 0:
            return {}
        results = {}
        for ents in _chunks(eids, 250):
            q = {"entity": {"id": ents, "limit": 1001}}
            res = self.rfqapi.query(q)
            results.update({
                eid: res['entity_details'][eid]
                for eid in res['entity_details']
                if res['entity_details'][eid]['type'] in
                self._RELATED_ENTITY_TYPES})
        return results

    def _safe_get_related_entities_from_docs(self, docs):
        '''Aggregate co-occurring entities per document (batches of 250 doc
        ids) and fold them into each IOC's Related<Type> lists and counts.
        '''
        for docids in _chunks(docs, 250):
            q = {"instance": {"type": "Event",
                              "document": {"id": docids}},
                 "output": {"count": {"axis": ["document",
                                               {"name": "attributes.entities",
                                                "type": self._RELATED_ENTITY_TYPES,
                                                "aspect": "all"}],
                                      "values": [self._INSTANCES_OR_DOCUMENTS]}}}
            res = self.rfqapi.query(q)
            counts = res['counts'][0]
            for ioc in self.response:
                ioc_resp = self.response[ioc]
                for docid in filter(lambda did: did in counts,
                                    ioc_resp['DocumentIds']):
                    for asp_name in filter(lambda n: n != 'NONE',
                                           counts[docid]):
                        name, unused, etype = rf_agg_name_parser(asp_name)
                        if name == ioc:
                            continue
                        # update related counts
                        if name not in ioc_resp['Related' + etype]:
                            ioc_resp['Related' + etype].append(name)
        # Finalize counts; remove working keys the caller did not request.
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            if 'DocumentIds' not in self.keys and 'DocumentIds' in ioc_resp:
                del ioc_resp['DocumentIds']
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(
                        ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and \
                        'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _get_rfids(self, iocs):
        # Map each IOC name to its Recorded Future entity id (None when the
        # name cannot be resolved), preserving the input order.
        new_iocs = collections.OrderedDict()
        edetails = {}
        for names in _chunks(iocs, 250):
            if len(names) == 0:
                continue
            q = {"entity": {"name": names,
                            "type": self.entity_type,
                            "limit": 501}}
            res = self.rfqapi.query(q)
            if len(res['entities']) == 0:
                continue
            for ent in res['entities']:
                edetails[res['entity_details'][ent]['name']] = ent
        for ioc in iocs:
            new_iocs[ioc] = edetails[ioc] if ioc in edetails else None
        return new_iocs