Example No. 1
    def ClusterHouses(matches, plot_groups=False):
        groups = {}
        try:
            N = len(matches)
            X = np.zeros((N, 2))
            for m in range(N):
                loc = RFAPI.house_location(matches[m])
                #logging.debug("ClusterHouses({})".format(loc))
                X[m] = (loc[0], loc[1])

            params = {
                'quantile': .3,
                'eps': .15,
                'damping': .9,
                'preference': -5,
                'n_neighbors': 2,
                'n_clusters': 5
            }

            # a bit buggy..
            spectral = cluster.SpectralClustering(
                n_clusters=params['n_clusters'],
                eigen_solver='arpack',
                affinity="nearest_neighbors")

            # best so far!
            gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                          covariance_type='full')

            # yielded one cluster..
            affinity_propagation = cluster.AffinityPropagation(
                damping=params['damping'], preference=params['preference'])

            bandwidth = cluster.estimate_bandwidth(X,
                                                   quantile=params['quantile'])
            ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

            algorithm = ms

            algorithm.fit(X)
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)
            for m in range(len(matches)):
                key = str(y_pred[m])
                if groups.get(key) is None:
                    groups[key] = []

                groups[key].append({
                    "adress": RFAPI.house_address(matches[m]),
                    "location": [X[m][0], X[m][1]]
                })
            logging.debug("groups = {}".format(groups))
            if plot_groups:
                HouseScore._plot_groups(X, y_pred)
        except Exception as e:
            groups["error"] = str(e)
            logging.error(groups["error"])
        return groups
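For readers without the RFAPI helper, here is a standalone sketch of the same MeanShift step on synthetic latitude/longitude pairs; the coordinates and the fixed bandwidth are illustrative only (the example above estimates the bandwidth from the data).

import numpy as np
from sklearn import cluster

# Two synthetic "neighborhoods" of house locations (lat, lon).
X = np.array([
    [47.610, -122.330], [47.612, -122.331], [47.611, -122.333],
    [47.980, -122.200], [47.982, -122.202], [47.979, -122.201],
])
# Fixed bandwidth for this sketch; ClusterHouses above uses
# cluster.estimate_bandwidth(X, quantile=.3) instead.
ms = cluster.MeanShift(bandwidth=0.05, bin_seeding=True)
ms.fit(X)
print(ms.labels_)  # one cluster label per location, e.g. [0 0 0 1 1 1]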
Example No. 2
def main():
    options, args = parse_arguments()
    query = build_query(options, args)

    api = RFAPI(options.token)

    substitute_fields = ["attributes"]
    output_columns = [
        "id",
        "momentum",
        "positive",
        "negative",
        "canonical.id",
        "type",
        "document.id",
        "document.published",
        "document.downloaded",
        "start",
        "stop",
        "document.url",
        "document.title",
        "document.sourceId.id",
        "document.sourceId.name",
        "document.sourceId.media_type",
        "document.sourceId.topic",
        "document.sourceId.country",
        "fragment",
        "attributes",
    ]
    entity_columns = ["id", "name", "hits", "type", "momentum", "attributes"]

    out = csv.DictWriter(sys.stdout, output_columns, extrasaction="ignore")

    if query.get("aggregate") or query.get("output", {}).get("count"):
        res = api.query(query)
        print res
    else:
        if options.header:
            out.writerow(dict(zip(output_columns, output_columns)))
        if options.entityfile:
            entityout = csv.DictWriter(open(options.entityfile, "w"), entity_columns, extrasaction="ignore")
            entityout.writerow(dict(zip(entity_columns, entity_columns)))

    for res in api.paged_query(query):
        for i in res["instances"]:
            i["positive"] = i.get("attributes", {}).get("positive", 0.0)
            i["negative"] = i.get("attributes", {}).get("negative", 0.0)
            out.writerow(encode_instance(flatten_instance(i, res["entities"], substitute_fields)))

        if options.entityfile:
            entities = pack_entity_attributes(res["entities"], entity_columns)
            for e in entities:
                # Here we reuse the instance formatting code to format entities for output.
                entityout.writerow(encode_instance(flatten_instance(e, res["entities"], [])))

        if not options.page:
            break
Example No. 3
def main():
    options, args = parse_arguments()
    query = build_query(options, args)

    api = RFAPI(options.token)

    substitute_fields = ['attributes']
    output_columns = [
        'id', 'momentum', 'positive', 'negative', 'canonical.id', 'type',
        'document.id', 'document.published', 'document.downloaded', 'start',
        'stop', 'document.url', 'document.title', 'document.sourceId.id',
        'document.sourceId.name', 'document.sourceId.media_type',
        'document.sourceId.topic', 'document.sourceId.country', 'fragment',
        'attributes'
    ]
    entity_columns = ['id', 'name', 'hits', 'type', 'momentum', 'attributes']

    out = csv.DictWriter(sys.stdout, output_columns, extrasaction='ignore')

    if query.get('aggregate') or query.get('output', {}).get('count'):
        res = api.query(query)
        print res
        return

    if options.header:
        out.writerow(dict(zip(output_columns, output_columns)))
    if options.entityfile:
        entityout = csv.DictWriter(open(options.entityfile, 'w'),
                                   entity_columns,
                                   extrasaction='ignore')
        entityout.writerow(dict(zip(entity_columns, entity_columns)))

    for res in api.paged_query(query):
        for i in res['instances']:
            i['positive'] = i.get('attributes', {}).get('positive', 0.0)
            i['negative'] = i.get('attributes', {}).get('negative', 0.0)
            out.writerow(
                encode_instance(
                    flatten_instance(i, res['entities'], substitute_fields)))

        if options.entityfile:
            entities = pack_entity_attributes(res['entities'], entity_columns)
            for e in entities:
                #Here we reuse the instance formatting code to format entities for output.
                entityout.writerow(
                    encode_instance(flatten_instance(e, res['entities'], [])))

        if not options.page:
            break
Example No. 4
    def distance(self, house, details):
        ret = 0.0
        median, div, cutoff, weight = self._get_measure_facts("distance")
        fav_len = len(self.fav_locations)
        if fav_len == 0 or house.get("parcel") is None or house.get(
                "parcel").get("longitude") is None:
            house_address = RFAPI.house_address(house)
            if house_address != "":
                loc = AddressToLocation(house_address)
                if loc is not None and len(loc) == 2:
                    house["parcel"] = {"latitude": loc[0], "longitude": loc[1]}
            return HouseScoreResult(cutoff * weight, cutoff, False,
                                    "Can't measure distance")

        for fav in self.fav_locations:
            dist = LocationDistance(
                [house["parcel"]["latitude"], house["parcel"]["longitude"]],
                fav["loc"])
            diff = dist / div
            ret = ret + diff

        distance_average = ret / fav_len
        money = distance_average * weight
        message = "" if distance_average <= cutoff else "distance {}(mil) is larger than cut of {}(mil)".format(
            distance_average, cutoff)
        return HouseScoreResult(money, distance_average,
                                (distance_average < cutoff), message)
Example No. 5
    def post_process_one(self, m, search_name, get_details=False, force=False):
        house_details = None
        if get_details:
            house_details = self.rfapi.get_house_details(m,
                                                         force=force,
                                                         cache_time_format="")

        #logging.debug("post_process(get_details={})=>{}".format(get_details, house_details))
        RFAPI.generate_url_for_house(m)
        ha = RFAPI.house_address_parts(m)
        scores = None
        try:
            scores = self.get_scores(m, house_details)
        except Exception:
            logging.error("house : {}, throw {}".format(
                json.dumps(m), traceback.format_exc()))
        if scores is None:
            return m

        m['scores'] = scores
        is_good, message = HouseScore.get_house_score_message(m)

        if is_good:
            if house_details is None:
                house_details = self.rfapi.get_house_details(
                    m, force=force, cache_time_format="")
                scores = self.get_scores(m, house_details)
            house_neighborhoods = RFAPI.house_neighborhoods(m, house_details)
            m['scores'] = scores
            is_good, message = HouseScore.get_house_score_message(m)

            logging.info(message)
            # getting city data takes a long time, will do it only for winning houses!
            """
            self.city_data.get_data(
                house_address= ha['display']
                , city=ha['city']     # Everett
                , state_short=ha['state']
                , zip_code=ha['zip']
                , house_neighborhoods=house_neighborhoods
                , force=force)
            """
        else:
            logging.debug(message)

        return m
Example No. 6
 def __init__(self, token, iocs, entity_type, mode='core'):
     '''
     Parameters
     ----------
     token : str
         Recorded Future API token
     iocs : list or dict
         List of IOCs to enrich or dict of IOCs keyed by name with the value as the RFID. 
     entity_type : {"IpAddress", "Hash", "InternetDomainName"}
         Name of Recorded Future entity type for IOC.
     mode : {"core", "related", "debug"}
         Subset of features to return with enrichment. "core" is default.
     '''
     self.rfqapi = RFAPI(token)
     self.response = collections.OrderedDict()
     # need all features early for scoring; they're removed later
     # need to test whether this can be avoided
     keys = self._FEATURES['core']
     keys.update(self._FEATURES['debug'])
     if mode in ('related', 'debug'):
         keys.update(self._FEATURES['related'])
     if mode not in ('core', 'related', 'debug'):
         raise ValueError(
             '"mode" must be one of ("core", "related", "debug"). Input: %s.'
             % mode)
     self.mode = mode
     self.entity_type = entity_type
     if isinstance(iocs, list):
         self.iocs = self._get_rfids(iocs)
     elif isinstance(iocs, dict):
         self.iocs = iocs
     else:
         raise ValueError('"iocs" must be list or dict.')
     for ioc in self.iocs:
         new_resp = {}
         for key in keys:
             new_resp[key] = keys[key]
             if key == 'Name':
                 new_resp[key] = ioc
             elif key == 'RFID':
                 new_resp[key] = self.iocs[ioc]
             elif key == 'EntityType':
                 new_resp[key] = self.entity_type
         self.response[ioc] = new_resp
     self.keys = keys
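The docstring above spells out the constructor contract; below is a minimal, hypothetical usage sketch. The enclosing class is not shown in this excerpt, so `Enricher` is a stand-in name and the token is a placeholder.

# Hypothetical usage of the constructor documented above ("Enricher" stands in
# for the unshown enclosing class; the token is a placeholder).
token = "MY_RF_API_TOKEN"
iocs = ["8.8.8.8", "203.0.113.7"]  # a plain list is resolved to RFIDs via _get_rfids()
enricher = Enricher(token, iocs, entity_type="IpAddress", mode="core")
# enricher.response is an OrderedDict keyed by IOC, pre-filled with the selected
# feature defaults plus Name / RFID / EntityType for each entry.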
Example No. 7
    def get_scores(self, house, details=None):
        scores = {}
        total_score = 0.0
        for k in self.default_fields:
            if k == "value": continue
            method = getattr(self, k, None)
            if method is None: continue
            result = method(house, details)
            scores[k] = dict(**result._asdict())
            total_score += result.money
            if not result.accepted:
                scores["cutoff"] = ",".join(
                    filter(None, [scores.get("cutoff"), k]))

        # value evaluation must be run last
        result = self.value(house, details, total_score)
        scores["value"] = dict(**result._asdict())
        if not result.accepted:
            scores["cutoff"] = ",".join(
                filter(None, [scores.get("cutoff"), "value"]))

        scores["facts"] = {
            "build":
            house.get("year_built", -1),
            "full_address":
            RFAPI.house_address(house),
            "beds":
            house["beds"],
            "sqft":
            house["sqft"] if house.get("sqft") is not None else 0.0,
            "baths":
            house.get("baths", 0),
            "price":
            house["price"],
            "County":
            RFAPI.house_county(house, details),
            "photo":
            RFAPI.house_photo_url(house, details),
            "neighborhoods":
            RFAPI.house_neighborhoods(house, details)
            if details is not None else []
        }
        return scores
Example No. 8
    def __init__(self,
                 default_fields=None,
                 fav_locations=None,
                 cahe_folder=CACHE_DIR):
        super(HouseScore, self).__init__()

        self.cahe_folder = cahe_folder
        self.rfapi = RFAPI(cahe_folder=cahe_folder)

        if default_fields is not None:
            self.default_fields = default_fields
        else:
            self.default_fields = HouseScore.DEFAULTS.copy()

        if fav_locations is not None:
            self.fav_locations = fav_locations
        else:
            self.fav_locations = HouseScore.LoadFavorits(
                os.path.join(SCRIPT_DIR, "FavoriteLocations.json"))
Example No. 9
def main():
    options, args = parse_arguments()
    query = build_query(options, args)

    api = RFAPI(options.token)

    substitute_fields = ['attributes']
    output_columns = ['id', 'momentum', 'positive', 'negative', 'canonical.id',
            'type', 'document.id', 'document.published', 'document.downloaded',
            'start', 'stop', 'document.url','document.title',
            'document.sourceId.id', 'document.sourceId.name',
            'document.sourceId.media_type', 'document.sourceId.topic',
            'document.sourceId.country', 'fragment', 'attributes']
    entity_columns = ['id', 'name', 'hits', 'type', 'momentum', 'attributes']

    out = csv.DictWriter(sys.stdout, output_columns, extrasaction='ignore')

    if query.get('aggregate') or query.get('output', {}).get('count'):
        res = api.query(query)
        print res
        return
        
    if options.header:
        out.writerow(dict(zip(output_columns, output_columns)))
    if options.entityfile:
        entityout = csv.DictWriter(open(options.entityfile, 'w'), entity_columns, extrasaction='ignore')
        entityout.writerow(dict(zip(entity_columns, entity_columns)))

    for res in api.paged_query(query):
        for i in res['instances']:
            i['positive'] = i.get('attributes', {}).get('positive', 0.0)
            i['negative'] = i.get('attributes', {}).get('negative', 0.0)
            out.writerow(encode_instance(flatten_instance(i, res['entities'], substitute_fields)))

        if options.entityfile:
            entities = pack_entity_attributes(res['entities'], entity_columns)
            for e in entities:
                #Here we reuse the instance formatting code to format entities for output.
                entityout.writerow(encode_instance(flatten_instance(e, res['entities'], [])))

        if not options.page:
            break
Example No. 10
def get_all_iocs(token, e_type, index_min, index_max):
    '''Gets all entities of type e_type found between
    index_min and index_max
    '''
    rfqapi = RFAPI(token)
    q = {"instance": {"type": "Event",
                      "attributes": [{"entity": {"type": e_type}}],
                      "document": {"indexed": {"min": index_min,
                                               "max": index_max}}},
         "output": {"count": {"axis":[{"name":"attributes.entities",
                                       "type":e_type,
                                       "aspect":"all"}],
                              "values":["instances"]}}}
    res = rfqapi.query(q)
    iocs = res["counts"][0].keys()
    ioc_dict = {}
    for ioc in iocs:
        ioc_name, rfid, unused = rf_agg_name_parser(ioc)
        ioc_dict[ioc_name] = rfid
    return ioc_dict
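A hypothetical call of the function above; the token and index bounds are placeholders, and the timestamp format shown is only an assumption.

# Hypothetical call; token and index bounds are placeholders.
token = "MY_RF_API_TOKEN"
ioc_dict = get_all_iocs(token, "IpAddress", "2017-01-01", "2017-01-02")
# ioc_dict maps each IOC name to its Recorded Future id (RFID).
for name in ioc_dict:
    print(name)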
Example No. 11
def main():
    # Construct a RFAPI query object 
    rfqapi = RFAPI(TOKEN)

    # Query for the metadata 
    mdata_result = rfqapi.paged_query(q)

    # Loop over all the metadata and each metadata attributes 
    for metadata in mdata_result:
        mdata_types = metadata['types']
        for md_type in mdata_types:
            # Print each Root Metadata Type
            parent_type = ""
            if 'parent' in md_type:
                parent_type = str(md_type['parent'])
            print md_type['name'] + "(" + parent_type + ")"

            # Loop over attributes in this metadata type and print their corresponding types 
            for md_attr_list in md_type['attrs']:
                print_attributes(md_attr_list)
Example No. 12
    def get_house_score_message(m):
        scores = m['scores']
        id_str = RFAPI.house_address(m)

        if m.get("URL") is not None:
            id_str = "{} : http://www.redfin.com{}".format(id_str, m["URL"])

        if scores.get("cutoff") is not None:
            return False, "{} => Cut for {{ {} }} score = {}".format(
                id_str, scores["cutoff"], scores)
        else:
            return True, ("{} => {}".format(id_str, scores))
Example No. 13
def get_all_iocs(token, e_type, index_min, index_max):
    '''Gets all entities of type e_type found between
    index_min and index_max
    '''
    rfqapi = RFAPI(token)
    q = {
        "instance": {
            "type": "Event",
            "attributes": [{
                "entity": {
                    "type": e_type
                }
            }],
            "document": {
                "indexed": {
                    "min": index_min,
                    "max": index_max
                }
            }
        },
        "output": {
            "count": {
                "axis": [{
                    "name": "attributes.entities",
                    "type": e_type,
                    "aspect": "all"
                }],
                "values": ["instances"]
            }
        }
    }
    res = rfqapi.query(q)
    iocs = res["counts"][0].keys()
    ioc_dict = {}
    for ioc in iocs:
        ioc_name, rfid, unused = rf_agg_name_parser(ioc)
        ioc_dict[ioc_name] = rfid
    return ioc_dict
Example No. 14
 def __init__(self, token, iocs, entity_type, mode='core'):
     '''
     Parameters
     ----------
     token : str
         Recorded Future API token
     iocs : list or dict
         List of IOCs to enrich or dict of IOCs keyed by name with the value as the RFID. 
     entity_type : {"IpAddress", "Hash", "InternetDomainName"}
         Name of Recorded Future entity type for IOC.
     mode : {"core", "related", "debug"}
         Subset of features to return with enrichment. "core" is default.
     '''
     self.rfqapi = RFAPI(token)
     self.response = collections.OrderedDict()
     # need all features early for scoring; they're removed later
     # need to test whether this can be avoided
     keys = self._FEATURES['core']
     keys.update(self._FEATURES['debug'])
     if mode in ('related', 'debug'):
         keys.update(self._FEATURES['related'])
     if mode not in ('core', 'related', 'debug'):
         raise ValueError('"mode" must be one of ("core", "related", "debug"). Input: %s.' % mode)
     self.mode = mode
     self.entity_type = entity_type
     if isinstance(iocs, list):
         self.iocs = self._get_rfids(iocs)
     elif isinstance(iocs, dict):
         self.iocs = iocs
     else:
         raise ValueError('"iocs" must be list or dict.')
     for ioc in self.iocs:
         new_resp = {}
         for key in keys:
             new_resp[key] = keys[key]
             if key == 'Name':
                 new_resp[key] = ioc
             elif key == 'RFID':
                 new_resp[key] = self.iocs[ioc]
             elif key == 'EntityType':
                 new_resp[key] = self.entity_type
         self.response[ioc] = new_resp
     self.keys = keys
Example No. 15
        risk = int(row['Risk'])
        if risk >= args.ip_risk_floor:
            ip_form = row['Name']
            if '/' in ip_form:
                # We don't want to include CIDR ranges.
                continue

            print('\t'.join([
                ip_form, 'Intel::ADDR',
                meta_source, intel_summ_link('ip:'+ip_form),
                do_notice, '-'
                ]))
            c += 1

# Hashes.
api = RFAPI(args.token)
hash_query = {
  "cluster": {
    "data_group": "Hash",
    "limit": 10000,
    "attributes": [
      {
        "name": "stats.metrics.riskScore",
        "range": {
          "gte": args.hash_risk_floor
        }
      }
    ]
  },
  "output": {
    "exclude": [
Example No. 16
class RFEnricher(object):
    pattern_to_ioc = {  
        '{0}|{1}'.format(ipv4_regexp, ipv6_regexp):
        {
            'data_group': 'EnrichIpAddress',
            'function': 'enriched-ip-address',
            'id_prefix': 'ip'
        },
        idn_regexp: {
            'data_group': 'EnrichInternetDomainName',
            'function': 'enriched-internet-domain-name',
            'id_prefix': 'idn'        
        },
        hash_regexp: {
            'data_group': 'EnrichHash',
            'function': 'enriched-hash',
            'id_prefix': 'hash'
        }
    }

    def __init__(self, token):
        self.rfapi = RFAPI(token)

    def enrich(self, iocs):
        sys.stderr.write('Enriching {0} IOC(s)...\n'.format(len(iocs)))
        enrichment = {}
        for ioc in iocs:
            for pattern, query_config in self.pattern_to_ioc.items():
                if not re.match(pattern, ioc):
                    continue
                sys.stderr.write('\tProcessing {0} : {1}... '.format(query_config['id_prefix'], ioc))
                enrichment[ioc] = self.query_enrich_ioc(ioc, query_config)
                sys.stderr.write('Done.\n')
                break
            else:
                sys.stderr.write('Unable to match "{0}" with any supported IOC type.\n'.format(ioc))
        return enrichment

    def get_entity_id(self, id_prefix, name):
        if id_prefix != 'hash': 
            return "{0}:{1}".format(id_prefix, name)
        res = self.rfapi.query({
            'entity': {
                'name': name,
                'type': 'Hash',
                'limit': 1
            }
        })
        if len(res.get('entities', [])) == 0:
            return None
        return res['entities'][0]


    def query_enrich_ioc(self, text, query_config):
        entity_id = self.get_entity_id(query_config['id_prefix'], text)
        if not entity_id:
            return "No enrichment available."
        q = {
          "cluster": {
            "function": query_config['function'],
            "attributes": [
              {
                "entity": {
                  "id": entity_id
                }
              }
            ],
            "limit": 1,
            "data_group": query_config['data_group']
          },
          "output": {
            "inline_entities": True
          }
        }
        res = self.rfapi.query(q)
        if res['count']['events']['total'] == 0:
            return "No enrichment available."
        enr_data = res['events'][0]['stats']
        enr_data['rf_link'] = 'https://www.recordedfuture.com/live/sc/entity/' + entity_id
        return enr_data
Example No. 17
    for row in csv_fd:
        risk = int(row['Risk'])
        if risk >= args.ip_risk_floor:
            ip_form = row['Name']
            if '/' in ip_form:
                # We don't want to include CIDR ranges.
                continue

            print('\t'.join([
                ip_form, 'Intel::ADDR', meta_source,
                intel_summ_link('ip:' + ip_form), do_notice, '-'
            ]))
            c += 1

# Hashes.
api = RFAPI(args.token)
hash_query = {
    "cluster": {
        "data_group":
        "Hash",
        "limit":
        10000,
        "attributes": [{
            "name": "stats.metrics.riskScore",
            "range": {
                "gte": args.hash_risk_floor
            }
        }]
    },
    "output": {
        "exclude": ["stats.entity_lists"],
Example No. 18
def main():
    options, args = parse_arguments()
    query = build_query(options, args)
    api = RFAPI(options.token)
    res = api.query(query)
    print res
Example No. 19
 def __init__(self, token):
     self.rfapi = RFAPI(token)
Example No. 20
    for f in toplists:
        w = csv.writer(open(f + '.csv', 'wb'), dialect='toplists')
        header = ['Name', 'Hits']
        if bools['assoc']: header += ['Associated Entities']
        header += ['Link']
        w.writerow(header)
        w.writerows(toplists[f])

def get_arguments():
    parser = argparse.ArgumentParser(description='Pull top lists from Recorded Future.')
    parser.add_argument('token', help="Recorded Future API token.")
    parser.add_argument('n', help="Number of results to return per query.", type=int)
    parser.add_argument('period', help="Number of days back to query.", type=int)
    parser.add_argument('query_file', nargs='+', help="Query files.")
    parser.add_argument('-assoc', '--assoc', help="Include associated products, techs, malware.", action='store_true')
    parser.add_argument('-new', '--new', help="Include only entities first seen in the last %s days." % emerging_threshold.days, action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    args = get_arguments()
    token = args.token
    n = args.n
    period = args.period
    files = args.query_file
    bools = {"assoc": args.assoc,
             "new": args.new}
    rfqapi = RFAPI(token)
    queries = get_queries(files)
    toplists = get_toplists(queries, rfqapi, n, period, bools)
    write_details(toplists, bools)
Example No. 21
            "not": {
                "ip": "192.168.0.0/16"
            }
        }, {
            "not": {
                "ip": "127.0.0.1"
            }
        }, {
            "not": {
                "ip": "0.0.0.0"
            }
        }],
        "limit":
        10
    },
    "output": {
        "exclude": ["stats"],
        "inline_entities": True
    }
}

# Using RFAPI module, run query
# Note: To pull back all results, use rfqapi.paged_query(q)
# and a higher limit.
rfqapi = RFAPI(token)
result = rfqapi.query(q)

# Display the results (in this case, limit is 1)
for res in result['events']:
    print "Event: \n"
    print str(res) + '\n'
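As the comment above notes, paged_query() walks the full result set instead of a single limited response; a minimal sketch, assuming the same `q` and `token` as in this example with a higher limit in the query:

# Sketch of the paged variant mentioned above (same q and token, higher limit assumed).
rfqapi = RFAPI(token)
for page in rfqapi.paged_query(q):
    for res in page['events']:
        print str(res) + '\n'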
Example No. 22
class RFEnricher(object):
    pattern_to_ioc = {
        '{0}|{1}'.format(ipv4_regexp, ipv6_regexp): {
            'data_group': 'IpAddress',
            'id_prefix': 'ip'
        },
        idn_regexp: {
            'data_group': 'InternetDomainName',
            'id_prefix': 'idn'
        },
        hash_regexp: {
            'data_group': 'Hash',
            'id_prefix': 'hash'
        }
    }

    def __init__(self, token):
        self.rfapi = RFAPI(token)

    def enrich(self, iocs):
        sys.stderr.write('Enriching {0} IOC(s)...\n'.format(len(iocs)))
        enrichment = {}
        for ioc in iocs:
            for pattern, query_config in self.pattern_to_ioc.items():
                if not re.match(pattern, ioc):
                    continue
                sys.stderr.write('\tProcessing {0} : {1}... '.format(
                    query_config['id_prefix'], ioc))
                enrichment[ioc] = self.query_enrich_ioc(ioc, query_config)
                sys.stderr.write('Done.\n')
                break
            else:
                sys.stderr.write(
                    'Unable to match "{0}" with any supported IOC type.\n'.
                    format(ioc))
        return enrichment

    def get_entity_id(self, id_prefix, name):
        if id_prefix != 'hash':
            return "{0}:{1}".format(id_prefix, name)
        res = self.rfapi.query(
            {'entity': {
                'name': name,
                'type': 'Hash',
                'limit': 1
            }})
        if len(res.get('entities', [])) == 0:
            return None
        return res['entities'][0]

    def query_enrich_ioc(self, text, query_config):
        entity_id = self.get_entity_id(query_config['id_prefix'], text)
        if not entity_id:
            return "No enrichment available."
        q = {
            "cluster": {
                "attributes": [{
                    "entity": {
                        "id": entity_id
                    }
                }],
                "limit": 1,
                "data_group": query_config['data_group']
            },
            "output": {
                "inline_entities": True
            }
        }
        res = self.rfapi.query(q)
        if res['count']['events']['total'] == 0:
            return "No enrichment available."
        enr_data = res['events'][0]['stats']
        enr_data[
            'rf_link'] = 'https://www.recordedfuture.com/live/sc/entity/' + entity_id
        return enr_data
Example No. 23
class HouseScore(object):
    DEFAULTS = {
        "distance": {
            "median": 8.0,  # stay in King County
            "div": 0.5,
            "weight": -9000.0,  # one mile costs ( $900 / year ) * 10 years
            "cutoff": 18.0
        },
        "area": {
            "median": 2700.0,  # We are looking for a house around this area
            "div": 1.0,
            "weight": 225.0,  # a good house would cost $275 per SF
            "cutoff": 1900.0
        },
        "build": {
            "median": 2018.0,
            "div": 1.0,
            "weight": -2000.0,
            "cutoff": 1990.0
        },
        "beds": {
            "median": 4.0,
            "div": 1.0,
            "weight": 10000.0,  # for each extra room you get 10K
            "cutoff": 3.0
        },
        "backyard": {
            "median": 2500.0,  # not used
            "div": 1.0,
            "weight": 5.0,  # I'd pay extra 25K for 5000 SF back yard
            "cutoff": 500.0
        },
        "crime": {
            "median": 236.5,
            "div": 1.0,
            "weight": 1.0,
            "cutoff": -10.0
        },
        "history": {
            "median": 7.0,
            "div": 1000.0,  # lose 0.1% every day on market
            "weight": 1.0,  # penelty = dom * price / div
            "cutoff":
            155.0,  # if no human finds this house good for 5 months, don't consider it!
            "history_days": 365.0,
            "pending_penelty": 10000.0,
            "inspection_penelty": 25000.0,
            "delisted_penelty": 5000.0
        },
        "layout": {
            "median": 0.0,  # not used
            "div": 0.01,
            "weight": 1.0,
            "cutoff": 0.0,  # not used
            "beds_min": 3.0,
            "bed_bonus": 10000.0,
            "baths_min": 2.5,
            "bath_bonus": 2000.0,
            "required": {
                "Attached Garage": 5000.0,
                "Living Room": 5000.0,
                "Dining Room": 5000.0
            },
            "optional": {
                "Bonus Room": 1000.0,
                "Family Room": 5000.0,
                "Recreation Room": 1000.0,
                "Walk-In Closet": 1000.0,
                "Utility Room": 1000.0,
                "Loft": 1000.0,
                "Den": 3000.0,
                "Office": 1000.0
            }
        },
        "amenities": {
            "median": 95.0,  # not used
            "div": 0.01,
            "weight": 1.0,
            "cutoff": 80.0,  # should have 80% of what we are looking for
            "required": {
                "Forced Air Heating": 10000.0,
                "Dishwasher": 2000.0,
                "Dryer": 1000.0,
                "Oven": 1000.0,
                "Refrigerator": 3000.0,
                "'Washer'": 1000.0,
                "Public Water Source": 5000.0,
                "Sewer Connected": 5000.0,
                "Garbage Disposal": 1000.0,
                "High Speed Internet": 5000.0
            },
            "optional": {
                "Microwave": 500.0,
                "Composition Roof": 2000.0,
                "Central Air Conditioning": 10000.0,
                "'King County'": 100000.0,
                "'Bothell'": 50000.0,
                "'Kenmore'": 50000.0,
                "'Brier'": 20000.0
            }
        },
        "value": {
            # always run this last
            "total_cutoff": 500000.0,
            "percentage_cutoff": -50.0
        }
    }
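    # Illustrative arithmetic (not from the original source) showing how these
    # defaults become dollar scores in the measure methods below:
    #   area:     2000 sqft                  -> 2000 * 225.0        = +$450,000
    #   distance: 5 mi average to favorites  -> (5 / 0.5) * -9000.0 = -$90,000
    #   layout:   each bed above beds_min    -> +$10,000 (bed_bonus)
    # get_scores() sums each measure's money value, and value() then compares
    # the total against value.total_cutoff.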

    def __init__(self,
                 default_fields=None,
                 fav_locations=None,
                 cahe_folder=CACHE_DIR):
        super(HouseScore, self).__init__()

        self.cahe_folder = cahe_folder
        self.rfapi = RFAPI(cahe_folder=cahe_folder)

        if default_fields is not None:
            self.default_fields = default_fields
        else:
            self.default_fields = HouseScore.DEFAULTS.copy()

        if fav_locations is not None:
            self.fav_locations = fav_locations
        else:
            self.fav_locations = HouseScore.LoadFavorits(
                os.path.join(SCRIPT_DIR, "FavoriteLocations.json"))

    def value(self, house, details=None, total_score=0.0):
        total_cutoff = self.default_fields["value"]["total_cutoff"]
        percentage_cutoff = self.default_fields["value"]["percentage_cutoff"]

        details_available = details is not None

        message = []
        accepted = True

        if total_score < total_cutoff and details_available:
            accepted = False
            message.append("house valued at {} which is less than {}".format(
                total_score, total_cutoff))

        gain = total_score - float(house["price"])
        gain_percentage = gain / total_score * 100
        if gain_percentage < percentage_cutoff and details_available:
            accepted = False
            message.append("house ROI of {} which is less than {}".format(
                gain_percentage, percentage_cutoff))

        return HouseScoreResult(total_score, gain_percentage, accepted,
                                ", ".join(message))

    def get_scores(self, house, details=None):
        scores = {}
        total_score = 0.0
        for k in self.default_fields:
            if k == "value": continue
            method = getattr(self, k, None)
            if method is None: continue
            result = method(house, details)
            scores[k] = dict(**result._asdict())
            total_score += result.money
            if not result.accepted:
                scores["cutoff"] = ",".join(
                    filter(None, [scores.get("cutoff"), k]))

        # value evaluation must be run last
        result = self.value(house, details, total_score)
        scores["value"] = dict(**result._asdict())
        if not result.accepted:
            scores["cutoff"] = ",".join(
                filter(None, [scores.get("cutoff"), "value"]))

        scores["facts"] = {
            "build":
            house.get("year_built", -1),
            "full_address":
            RFAPI.house_address(house),
            "beds":
            house["beds"],
            "sqft":
            house["sqft"] if house.get("sqft") is not None else 0.0,
            "baths":
            house.get("baths", 0),
            "price":
            house["price"],
            "County":
            RFAPI.house_county(house, details),
            "photo":
            RFAPI.house_photo_url(house, details),
            "neighborhoods":
            RFAPI.house_neighborhoods(house, details)
            if details is not None else []
        }
        return scores

    def _get_measure_facts(self, measure_name):
        median = self.default_fields[measure_name]["median"]
        div = self.default_fields[measure_name]["div"]
        cutoff = self.default_fields[measure_name]["cutoff"]
        weight = self.default_fields[measure_name]["weight"]
        return (median, div, cutoff, weight)

    def area(self, house, details):
        median, div, cutoff, weight = self._get_measure_facts("area")
        if house.get("sqft") is None:
            return HouseScoreResult(0.0, 0.0, False, "missing 'sqft'")
        area = house["sqft"]
        money = area * weight
        message = "" if area >= cutoff else "area {}(sf) is less than cut of {}(sf)".format(
            area, cutoff)
        return HouseScoreResult(money, area, (area >= cutoff), message)

    def build(self, house, details):
        median, div, cutoff, weight = self._get_measure_facts("build")
        if house.get("year_built") is None:
            return HouseScoreResult(0.0, 0.0, False, "missing 'year_built'")
        year_built = house["year_built"]
        money = (median - year_built) / div * weight
        message = "" if year_built >= cutoff else "House is built in {}, older than cut of {}".format(
            year_built, cutoff)
        return HouseScoreResult(money, year_built, (year_built >= cutoff),
                                message)

    def backyard(self, house, details):
        if house.get("lotsize") is None or house.get("sqft") is None:
            return HouseScoreResult(0.0, 0.0, False,
                                    "missing 'lotsize' or 'sqft'")
        median, div, cutoff, weight = self._get_measure_facts("backyard")
        remaining_for_backyard = float(house["lotsize"]) - float(house["sqft"])
        money = remaining_for_backyard * weight
        message = "" if remaining_for_backyard >= cutoff else "backyard {}(sf) is less than cut of {}(sf)".format(
            remaining_for_backyard, cutoff)
        return HouseScoreResult(money, remaining_for_backyard,
                                (remaining_for_backyard >= cutoff), message)

    def amenities(self, house, details):
        return self.amenitiesInfo("amenities", house, details)

    def layout(self, house, details):
        info = self.amenitiesInfo("layout", house, details)
        money_score = info.money
        # check beds / baths
        if house.get("beds") is not None:
            extra_beds = house["beds"] - self.default_fields["layout"][
                "beds_min"]
            money_score += extra_beds * self.default_fields["layout"][
                "bed_bonus"]
        if house.get("baths") is not None:
            extra_baths = house["baths"] - self.default_fields["layout"][
                "baths_min"]
            money_score += extra_baths * self.default_fields["layout"][
                "bath_bonus"]

        return HouseScoreResult(money_score, info.value, info.accepted,
                                info.message)

    def dom(self, house, details):
        median, div, cutoff, weight = self._get_measure_facts("history")
        now_epoch = time.time() * 1000.0
        oldest_epoch = now_epoch - self.default_fields["history"][
            "history_days"] * MS_IN_A_DAY

        house["dom_fixed"] = house["dom"] if house.get(
            "dom") is not None else 0.0
        if details is not None:
            oldest_history_after_sale = now_epoch
            for event in details["payload"]["propertyHistoryInfo"]["events"]:
                if "sold" in event["eventDescription"].lower() or event[
                        "eventDate"] < oldest_epoch:  # or event["historyEventType"] != 1
                    break  # assuming events are sorted!
                event_epoc = event["eventDate"]
                if event_epoc < oldest_history_after_sale:
                    #logging.debug("#1 oldest = {}, this event = {}".format(oldest_history_after_sale,event_epoc))
                    oldest_history_after_sale = event_epoc
            oldest_history_after_sale_days = (
                now_epoch - oldest_history_after_sale) / MS_IN_A_DAY
            #logging.debug("dom={}, dom_fixed={}".format(house["dom"], oldest_history_after_sale_days))
            house["dom_fixed"] = max(house["dom"],
                                     oldest_history_after_sale_days)
        if house.get("dom") is None:
            return HouseScoreResult(0.0, 0.0, False, "missing 'dom'")
        dom = house["dom_fixed"]
        money = (median - dom) / div * float(house["price"])
        message = "House was on the market for {} days".format(int(dom))
        if int(dom) != int(house["dom"]):
            message = message + " (reported {} days)".format(int(house["dom"]))
        if dom > cutoff:
            message = message + ", more than cut of {}".format(int(cutoff))
        return HouseScoreResult(money, dom, (dom < cutoff), message)

    @staticmethod
    def AddIfNotExists(list_to_append, element):
        if element not in list_to_append:
            list_to_append.append(element)
        return list_to_append

    def history(self, house, details):
        na = HouseScoreResult(0, 0.0, True, "Not enough details")
        if details is None:
            return na
        if details.get("payload") is None or details.get("payload").get(
                "propertyHistoryInfo") is None:
            return na
        median, div, cutoff, weight = self._get_measure_facts("history")
        messages = []
        dom_results = self.dom(house, details)
        messages.append(dom_results.message)

        epoch_days = time.time() * 1000.0 / MS_IN_A_DAY
        oldest_epoch_days = epoch_days - self.default_fields["history"][
            "history_days"]
        oldest_epoch_days = max(house["dom_fixed"], oldest_epoch_days)
        total_penelty = 0.0
        for event in details["payload"]["propertyHistoryInfo"]["events"]:
            event_epoc = event["eventDate"]
            event_epoc_days = event_epoc / MS_IN_A_DAY
            event_epoc_diff = int(epoch_days - event_epoc_days)
            event_str = json.dumps(event).lower()

            #logging.debug("#2 oldest = {}, this event = {}, diff = {}".format(oldest_epoch_days,event_epoc_days,event_epoc_diff))
            if event_epoc_days < oldest_epoch_days:
                continue

            #logging.debug("event = {}".format(event_str))

            if "inspection" in event_str:
                total_penelty -= self.default_fields["history"][
                    "inspection_penelty"]
                HouseScore.AddIfNotExists(
                    messages, "was pending inspection {} days ago".format(
                        event_epoc_diff))
            elif "pending" in event_str:
                total_penelty -= self.default_fields["history"][
                    "pending_penelty"]
                HouseScore.AddIfNotExists(
                    messages,
                    "was pending {} days ago".format(event_epoc_diff))
            elif event["eventDescription"].lower() in ["delisted", "relisted"]:
                total_penelty -= self.default_fields["history"][
                    "delisted_penelty"]
                HouseScore.AddIfNotExists(
                    messages,
                    "was relisted {} days ago".format(event_epoc_diff))

        return HouseScoreResult(total_penelty + dom_results.money,
                                dom_results.value, dom_results.accepted,
                                ", ".join(messages))

    def amenitiesInfo(self, key, house, details):
        na = HouseScoreResult(0, 0.0, True, "Not enough details")
        if details is None:
            return na
        if details.get("payload") is None or details.get("payload").get(
                "amenitiesInfo") is None:
            return na
        median, div, cutoff, weight = self._get_measure_facts(key)
        #logging.debug("amenities : detials = {}".format(details))
        amenities_str = str(details["payload"]).lower()
        #logging.debug("amenities_str = {}".format(amenities_str)) #disable me
        required_amenities_sum = 0.0
        amenities_score = 0.0
        missing_amenities = []
        for k in self.default_fields[key]["required"]:
            v = self.default_fields[key]["required"][k]
            required_amenities_sum = required_amenities_sum + v
            if k.lower() in amenities_str:
                amenities_score = amenities_score + v
            else:
                missing_amenities.append(k)

        for k in self.default_fields[key]["optional"]:
            if k.lower() in amenities_str:
                amenities_score = amenities_score + self.default_fields[key][
                    "optional"][k]
            else:
                missing_amenities.append(k)

        missing_amenities_message = "" if len(
            missing_amenities) == 0 else "missing : [ {} ]".format(
                ",".join(missing_amenities))
        percentage = amenities_score / required_amenities_sum / div
        message = missing_amenities_message if percentage >= cutoff else "percentage {} is less than cutoff of {}, {}".format(
            percentage, cutoff, missing_amenities_message)

        return HouseScoreResult(amenities_score, percentage,
                                (percentage >= cutoff), message)

    def distance(self, house, details):
        ret = 0.0
        median, div, cutoff, weight = self._get_measure_facts("distance")
        fav_len = len(self.fav_locations)
        if fav_len == 0 or house.get("parcel") is None or house.get(
                "parcel").get("longitude") is None:
            house_address = RFAPI.house_address(house)
            if house_address != "":
                loc = AddressToLocation(house_address)
                if loc is not None and len(loc) == 2:
                    house["parcel"] = {"latitude": loc[0], "longitude": loc[1]}
            return HouseScoreResult(cutoff * weight, cutoff, False,
                                    "Can't measure distance")

        for fav in self.fav_locations:
            dist = LocationDistance(
                [house["parcel"]["latitude"], house["parcel"]["longitude"]],
                fav["loc"])
            diff = dist / div
            ret = ret + diff

        distance_average = ret / fav_len
        money = distance_average * weight
        message = "" if distance_average <= cutoff else "distance {}(mil) is larger than cut of {}(mil)".format(
            distance_average, cutoff)
        return HouseScoreResult(money, distance_average,
                                (distance_average < cutoff), message)

    @staticmethod
    def get_house_score_message(m):
        scores = m['scores']
        id_str = RFAPI.house_address(m)

        if m.get("URL") is not None:
            id_str = "{} : http://www.redfin.com{}".format(id_str, m["URL"])

        if scores.get("cutoff") is not None:
            return False, "{} => Cut for {{ {} }} score = {}".format(
                id_str, scores["cutoff"], scores)
        else:
            return True, ("{} => {}".format(id_str, scores))

    def post_process_one(self, m, search_name, get_details=False, force=False):
        house_details = None
        if get_details:
            house_details = self.rfapi.get_house_details(m,
                                                         force=force,
                                                         cache_time_format="")

        #logging.debug("post_process(get_details={})=>{}".format(get_details, house_details))
        RFAPI.generate_url_for_house(m)
        ha = RFAPI.house_address_parts(m)
        scores = None
        try:
            scores = self.get_scores(m, house_details)
        except Exception:
            logging.error("house : {}, throw {}".format(
                json.dumps(m), traceback.format_exc()))
        if scores is None:
            return m

        m['scores'] = scores
        is_good, message = HouseScore.get_house_score_message(m)

        if is_good:
            if house_details is None:
                house_details = self.rfapi.get_house_details(
                    m, force=force, cache_time_format="")
                scores = self.get_scores(m, house_details)
            house_neighborhoods = RFAPI.house_neighborhoods(m, house_details)
            m['scores'] = scores
            is_good, message = HouseScore.get_house_score_message(m)

            logging.info(message)
            # getting city data takes a long time, will do it only for winning houses!
            """
            self.city_data.get_data(
                house_address= ha['display']
                , city=ha['city']     # Everett
                , state_short=ha['state']
                , zip_code=ha['zip']
                , house_neighborhoods=house_neighborhoods
                , force=force)
            """
        else:
            logging.debug(message)

        return m

    def post_process(self,
                     matches,
                     search_name,
                     get_details=False,
                     force=False):
        good_ones = 0

        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
            executor.map(
                lambda m: self.post_process_one(m,
                                                search_name=search_name,
                                                get_details=get_details,
                                                force=force), matches)

        logging.debug("Done parallel processing")

        for m in tqdm(matches):
            #     m = self.post_process_one(m, search_name=search_name, get_details=get_details, force=force)

            is_good, message = HouseScore.get_house_score_message(m)
            if is_good:
                good_ones = good_ones + 1

        logging.debug("{} Matches={}/{}".format(search_name, good_ones,
                                                len(matches)))
        return matches

    @staticmethod
    def _plot_groups(X, y_pred):
        colors = np.array(
            list(
                islice(
                    cycle([
                        '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
                        '#984ea3', '#999999', '#e41a1c', '#dede00'
                    ]), int(max(y_pred) + 1))))
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
        plt.show()

    @staticmethod
    def ClusterHouses(matches, plot_groups=False):
        groups = {}
        try:
            N = len(matches)
            X = np.zeros((N, 2))
            for m in range(N):
                loc = RFAPI.house_location(matches[m])
                #logging.debug("ClusterHouses({})".format(loc))
                X[m] = (loc[0], loc[1])

            params = {
                'quantile': .3,
                'eps': .15,
                'damping': .9,
                'preference': -5,
                'n_neighbors': 2,
                'n_clusters': 5
            }

            # a bit buggy..
            spectral = cluster.SpectralClustering(
                n_clusters=params['n_clusters'],
                eigen_solver='arpack',
                affinity="nearest_neighbors")

            # best so far!
            gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                          covariance_type='full')

            # yielded one cluster..
            affinity_propagation = cluster.AffinityPropagation(
                damping=params['damping'], preference=params['preference'])

            bandwidth = cluster.estimate_bandwidth(X,
                                                   quantile=params['quantile'])
            ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

            algorithm = ms

            algorithm.fit(X)
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)
            for m in range(len(matches)):
                key = str(y_pred[m])
                if groups.get(key) is None:
                    groups[key] = []

                groups[key].append({
                    "adress": RFAPI.house_address(matches[m]),
                    "location": [X[m][0], X[m][1]]
                })
            logging.debug("groups = {}".format(groups))
            if plot_groups:
                HouseScore._plot_groups(X, y_pred)
        except Exception as e:
            groups["error"] = str(e)
            logging.error(groups["error"])
        return groups

    @staticmethod
    def filter_good_houses(houses):
        return [m for m in houses if m['scores'].get("cutoff") is None]

    @staticmethod
    def add_html_tab(tab_id, tab_name, tab_content, tabs):
        is_first_tab = False
        tab_template = '<button class="tablinks {2}" onclick="openTab(event, \'{0}\')">{1}</button>'
        tab_content_template = '<div id="{0}" class="tabcontent">{1}</div>'

        if tabs is None:
            tabs = {"tabs": ['<div class="tab">', '</div>'], "contents": []}
            is_first_tab = True

        tabs['tabs'].insert(
            len(tabs['tabs']) - 1,
            tab_template.format(tab_id, tab_name,
                                'defaultOpen' if is_first_tab else ""))
        tabs['contents'].append(
            tab_content_template.format(tab_id, tab_content))
        return tabs

    @staticmethod
    def get_house_summary(house, rank=0):
        score = house.get("scores")
        if score is None:
            logging.warning("score is missing for {}".format(house))
            return {}
        first_listed = datetime.datetime.today() - datetime.timedelta(
            days=score["history"]["value"])
        house_summary = {
            "Address":
            score["facts"]["full_address"],
            "score":
            score["value"]["value"],
            "distance":
            score["distance"]["value"],
            "County":
            score["facts"]["County"],
            "Year Build":
            score["facts"]["build"],
            "beds":
            score["facts"]["beds"],
            "baths":
            score["facts"]["baths"],
            "price":
            score["facts"]["price"],
            "sqft":
            score["facts"]["sqft"],
            "lot size":
            score["facts"]["sqft"] + score["backyard"]["value"],
            "first_listed":
            first_listed.strftime("%m/%d/%Y"),
            "dom":
            score["history"]["message"],
            "url":
            "http://www.redfin.com{}".format(house["URL"]),
            "picture":
            score["facts"]["photo"],
            "price_for_sf":
            score["facts"]["price"] /
            score["facts"]["sqft"] if score["facts"]["sqft"] > 0 else 0.0,
            "user_rank":
            rank
        }
        return house_summary

    @staticmethod
    def get_houses_category_html(houses, summary="", category_id=""):
        html_content = []
        #logging.debug("houses {}".format(len(houses)))
        even_raw = False
        house_id = 0
        for house in houses:
            house_id += 1

            even_raw = not even_raw
            score = house.get("scores")
            if score is None:
                logging.warning("score is missing for {}".format(house))
                continue
            house_summary_raw = HouseScore.get_house_summary(house)
            house_summary = {
                "Address": house_summary_raw["Address"],
                "score": "{:.2f}".format(house_summary_raw["score"]),
                "distance": "{:.2f}".format(house_summary_raw["distance"]),
                "County": house_summary_raw["County"],
                "Year Build": house_summary_raw["Year Build"],
                "beds": house_summary_raw["beds"],
                "baths": house_summary_raw["baths"],
                "price": house_summary_raw["price"],
                "sqft": house_summary_raw["sqft"],
                "lot size": house_summary_raw["lot size"],
                "dom": house_summary_raw["dom"],
                "$/sf": "{:.0f}".format(house_summary_raw["price_for_sf"])
            }

            is_good, message = HouseScore.get_house_score_message(house)
            if is_good:
                pass_message = "Good ( score = {:.2f}, distance = {:.2f} )".format(
                    score["value"]["value"], score["distance"]["value"])
            else:
                house_summary["Failed"] = "[ {} ]".format(score["cutoff"])
                pass_message = "Failed [ {} ] ( score = {:.2f}, distance = {:.2f} )".format(
                    score["cutoff"], score["value"]["value"],
                    score["distance"]["value"])

            house_summary_html = DicToTHML(house_summary)
            score_html = DicToTHML(score)

            # build taps
            tabs = None
            house_group_id = "{}_{}".format(category_id, house_id)
            tabs = HouseScore.add_html_tab(
                tab_id="summary_{}".format(house_group_id),
                tab_name="Summary",
                tab_content=house_summary_html,
                tabs=tabs)
            tabs = HouseScore.add_html_tab(
                tab_id="details_{}".format(house_group_id),
                tab_name="Details",
                tab_content=score_html,
                tabs=tabs)

            maps_url = "https://www.google.com/maps/place/{}".format(
                score["facts"]["full_address"].replace(' ', '+'))
            areavibes_url = "https://www.areavibes.com/{}-{}/livability/".format(
                house['address_data']['city'].replace('-',
                                                      '+').replace(' ', '+'),
                house['address_data']['state'])
            spotcrime_url = "https://spotcrime.com/#{}".format(
                score["facts"]["full_address"].replace(' ', '%20').replace(
                    '-', '%20').replace(',', '%2C'))

            tabs = HouseScore.add_html_tab(
                tab_id="links_{}".format(house_group_id),
                tab_name="Links",
                tab_content="""
                <H3><A href="{}">Map</A></H3>
                <H3><A href="{}">Areavibes</A></H3>
                <H3><A href="{}">SpotCrime</A></H3>
                """.format(maps_url, areavibes_url, spotcrime_url),
                tabs=tabs)
            #
            #<IFRAME width='100%' height='500' src='https://spotcrime.com/"+spotcrime_sub_path+"'/>"
            tabs_html = "\n".join(tabs['tabs'] + tabs['contents'])
            this_house_report = r"""
        <TR {4}> <!-- draggable="true" //-->
            <TD align="center" valign="top" >
                <TABLE width="100%">
                    <TR><TD colspan="2" width="100%">
                        <H2><A href="http://www.redfin.com{1}">{2}</A></H2>
                    </TD></TR>
                    <TR><TD width="50%" align="center" valign="top" >
                        <A href="http://www.redfin.com{1}"><IMG width="100%" src="{0}" /></A><BR/>
                        <P>Details : {5}</P>
                    </TD><TD align="left" valign="top" width="50%">
                        {3}
                    </TD></TR>
                </TABLE>
            </TD>
        </TR>
                    """.format(
                score["facts"]["photo"]  #0
                ,
                house["URL"]  #1
                ,
                score["facts"]["full_address"]  #2
                ,
                tabs_html  #3
                ,
                ('class="one-house-dragable page-break"' if even_raw else
                 'class="one-house-dragable no-page-break"')  #4
                ,
                pass_message)

            html_content.append(this_house_report)

        if len(html_content) == 0:
            return ""

        category_template = """
        <H1>{$SUMMARY}</H1>
        <BR/>
        <TABLE width="90%" align="center" valign="top">
            {$ACCORDION_TEMPLATE_BODY}
        </TABLE>
        """

        category_template = category_template.replace("{$SUMMARY}", summary)
        category_template = category_template.replace(
            "{$ACCORDION_TEMPLATE_BODY}", "\n".join(html_content))

        return category_template

    @staticmethod
    def search_for_range(matches,
                         active_only=True,
                         min_price=0,
                         max_price=7600000,
                         only_good=True):
        good_houses = matches
        if active_only:
            good_houses = [
                m for m in good_houses
                if m.get('status') is None or m['status'] == 'Active'
            ]
        if only_good:
            good_houses = [
                m for m in good_houses if m['scores'].get("cutoff") is None
            ]
        good_houses = [
            m for m in good_houses
            if m['price'] >= min_price and m['price'] <= max_price
        ]
        sorted_good_houses = HouseScore.sort_by_total_score_vs_price(
            good_houses)
        return sorted_good_houses

    @staticmethod
    def get_houses_html(houses, title="", active_only=True, only_good=True):
        tabs = None
        filtered_matches = HouseScore.search_for_range(houses,
                                                       active_only=active_only,
                                                       min_price=490000,
                                                       max_price=760000,
                                                       only_good=only_good)
        tab_content = HouseScore.get_houses_category_html(
            filtered_matches,
            category_id="full",
            summary="Found ( {} / {} ) good houses".format(
                len(filtered_matches), len(houses)))
        tabs = HouseScore.add_html_tab(tab_id="full_id",
                                       tab_name="All Houses",
                                       tab_content=tab_content,
                                       tabs=tabs)

        filtered_matches = HouseScore.search_for_range(houses,
                                                       active_only=active_only,
                                                       min_price=490000,
                                                       max_price=610000,
                                                       only_good=only_good)
        tab_content = HouseScore.get_houses_category_html(
            filtered_matches,
            category_id="low",
            summary="Found ( {} / {} ) good houses".format(
                len(filtered_matches), len(houses)))
        tabs = HouseScore.add_html_tab(tab_id="low_id",
                                       tab_name="Low",
                                       tab_content=tab_content,
                                       tabs=tabs)

        filtered_matches = HouseScore.search_for_range(houses,
                                                       active_only=active_only,
                                                       min_price=590000,
                                                       max_price=710000,
                                                       only_good=only_good)
        tab_content = HouseScore.get_houses_category_html(
            filtered_matches,
            category_id="med",
            summary="Found ( {} / {} ) good houses".format(
                len(filtered_matches), len(houses)))
        tabs = HouseScore.add_html_tab(tab_id="med_id",
                                       tab_name="Medium",
                                       tab_content=tab_content,
                                       tabs=tabs)

        filtered_matches = HouseScore.search_for_range(houses,
                                                       active_only=active_only,
                                                       min_price=690000,
                                                       max_price=760000,
                                                       only_good=only_good)
        tab_content = HouseScore.get_houses_category_html(
            filtered_matches,
            category_id="high",
            summary="Found ( {} / {} ) good houses".format(
                len(filtered_matches), len(houses)))
        tabs = HouseScore.add_html_tab(tab_id="high_id",
                                       tab_name="High",
                                       tab_content=tab_content,
                                       tabs=tabs)

        with open(os.path.join(SCRIPT_DIR, "report_template.html"),
                  "r") as html_template_stream:
            html_template = html_template_stream.read()

        html_template = html_template.replace("{$TITLE}", title)
        html_template = html_template.replace("{$TABS}",
                                              "\n".join(tabs['tabs']))
        html_template = html_template.replace("{$TAB_CONTENTS}",
                                              "\n".join(tabs['contents']))

        return html_template

    @staticmethod
    def sort_by_total_score_vs_price(good_houses):
        cost_gain = {}
        for m in good_houses:
            gain = m['scores']['value']["money"] - m['scores']['facts']['price']
            gain_percentage = m['scores']['value']["value"]
            if cost_gain.get(gain_percentage) is None:
                cost_gain[gain_percentage] = []
            cost_gain[gain_percentage].append(m)

        sorted_keys = sorted(cost_gain, reverse=True)
        #logging.debug(sorted_keys)
        retVal = []
        for k in sorted_keys:
            for m in cost_gain[k]:
                #m['scores']['gain_percentage'] = k
                retVal.append(m)
        return retVal

    def SearchByUrl(self,
                    house_url,
                    get_details=True,
                    force=False,
                    cache_time_format="%Y%m%d"):
        house, details = self.rfapi.get_house_by_url(
            house_url, force=force, cache_time_format=cache_time_format)
        return self.post_process([house],
                                 "SearchByUrl({})".format(house_url),
                                 get_details=get_details,
                                 force=force)

    def Search(self, search_name, search_json, get_details=False, force=False):
        logging.debug(
            "Search(search_name={}, search_json={}, get_details={}, force={})".
            format(search_name, search_json, get_details, force))
        matches = self.rfapi.retrieve_json(search_json, force=force)
        return self.post_process(matches,
                                 search_name,
                                 get_details=get_details,
                                 force=force)

    def SearchForZIPCodes(self,
                          search_name,
                          search_json,
                          zip_codes,
                          get_details=False,
                          force=False):
        zip_regions = [
            self.rfapi.zipcode_to_regionid(zipcode, False)
            for zipcode in zip_codes
        ]
        # need to convert to region_id
        region_types = [2 for zipcode in zip_codes]
        all_matches = []
        for region in zip_regions:
            search_json["region_id"] = [region]
            matches = self.rfapi.retrieve_json(search_json, force=force)
            all_matches += matches
        return self.post_process(all_matches,
                                 search_name,
                                 get_details=get_details,
                                 force=force)

    @staticmethod
    def LoadFavorits(FavoriteLocationsFile):
        with open(FavoriteLocationsFile, "r") as stream:
            fav = json.load(stream)
            ret = []
            for v in fav:
                if v['importance'] != 0:
                    ret.append(v)
            return ret
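
A minimal sketch of the favorites file LoadFavorits reads: a JSON list of location entries, of which only those with a nonzero "importance" are kept. Every field name other than "importance" below is an illustrative assumption, not part of the original code.

import json

# Hypothetical favorites file; only "importance" is known from LoadFavorits above.
example_favorites = [
    {"name": "Office", "location": [37.42, -122.08], "importance": 5},
    {"name": "Airport", "location": [37.62, -122.38], "importance": 0},  # filtered out
]
with open("favorite_locations.json", "w") as stream:
    json.dump(example_favorites, stream, indent=2)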
Exemplo n.º 24
0
    def __init__(self, token):
        self.rfapi = RFAPI(token)
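
A hedged end-to-end sketch, assuming the constructor above belongs to the HouseScore class, that search_json follows whatever schema RFAPI.retrieve_json expects, and that Search returns the scored matches consumed by get_houses_html. The token, search name, and file names are placeholders.

# Hedged usage sketch; "YOUR_TOKEN" and the empty search_json are placeholders.
scorer = HouseScore("YOUR_TOKEN")
matches = scorer.Search("weekly_search", search_json={}, get_details=True)
report_html = HouseScore.get_houses_html(matches, title="Weekly house report")
with open("report.html", "w") as out:
    out.write(report_html)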
Exemplo n.º 25
0
      {
        "not": {
          "ip": "127.0.0.1"
        }
      },
      {
        "not": {
          "ip": "0.0.0.0"
        }
      }
    ],
    "limit": 10
  },
  "output": {
    "exclude": [
      "stats"
    ],
    "inline_entities": True
  }
}

# Using RFAPI module, run query
# Note: To pull back all results, use rfqapi.paged_query(q)
# and a higher limit. 
rfqapi = RFAPI(token)
result = rfqapi.query(q)
 
# Display the results (up to the query limit)
for res in result['events']:
    print "Event: \n"
    print str(res) + '\n'
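
As the comment above notes, a single query returns only one page; paged_query walks the full result set. A minimal sketch under the assumption that q is the same query dict (ideally with a larger "limit") and that each page exposes the same "events" list as the single-shot result.

# Hedged sketch: iterate over every page instead of a single response.
total = 0
for page in rfqapi.paged_query(q):
    for event in page.get("events", []):
        total += 1
        print str(event)
print "total events:", total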
Exemplo n.º 26
0
class IOCEnricher(object):
    '''Enriches a list of IOCs with data from Recorded Future.
    '''

    _VALID_TYPES = ["IpAddress", "Hash", "InternetDomainName"]
    _INSTANCES_OR_DOCUMENTS = 'instances'
    _MALICIOUS_INDICATORS = [
        "compromised", "malicious", "suspected", "threat", "malware",
        "infected", "honeypot", "attacked from", "exploit", "attacks from",
        "bad http request from", "attack detected", "attack deteted"
    ]
    _RELATED_ENTITY_TYPES = [
        'Malware', 'CyberVulnerability', 'IpAddress', 'Hash',
        'InternetDomainName'
    ]
    # can be "document" also, but enrichment will take much longer
    # to pull document-level co-entities. fragment-level will use
    # extended_entities where available
    _RELATED_ENTITY_SCOPE = "fragment"
    _FEATURES = {
        "debug":
        collections.OrderedDict([("RFID", ""), ("EntityType", ""),
                                 ("TotalHits", 0), ("7DayHits", 0),
                                 ("1DayHits", 0), ("MaliciousHits", 0),
                                 ("InfoSecHits", 0), ("PasteHits", 0),
                                 ("SocialMediaHits", 0)]),
        "related":
        collections.OrderedDict([("RelatedMalware", []),
                                 ("RelatedCyberVulnerability", []),
                                 ("RelatedIpAddress", []),
                                 ("RelatedInternetDomainName", []),
                                 ("RelatedHash", []),
                                 ("RelatedMalwareCount", 0),
                                 ("RelatedCyberVulnerabilityCount", 0),
                                 ("RelatedIpAddressCount", 0),
                                 ("RelatedInternetDomainNameCount", 0),
                                 ("RelatedHashCount", 0), ("Score", 0.0)]),
        "core":
        collections.OrderedDict([("Name", ""), ("RFURL", ""),
                                 ("MostRecent", ""), ("MostRecentSource", ""),
                                 ("MostRecentTitle", ""),
                                 ("MostRecentFragment", ""),
                                 ("MostRecentURL", ""),
                                 ("RecentInfoSecSource", ""),
                                 ("RecentInfoSecTitle", ""),
                                 ("RecentInfoSecFragment", ""),
                                 ("RecentInfoSecURL", ""),
                                 ("RecentPasteSource", ""),
                                 ("RecentPasteTitle", ""),
                                 ("RecentPasteFragment", ""),
                                 ("RecentPasteURL", ""),
                                 ("RecentSocialMediaSource", ""),
                                 ("RecentSocialMediaTitle", ""),
                                 ("RecentSocialMediaFragment", ""),
                                 ("RecentSocialMediaURL", ""),
                                 ("FirstSource", ""), ("FirstTitle", ""),
                                 ("FirstFragment", ""), ("FirstURL", ""),
                                 ("FirstPublished", "")])
    }

    def __init__(self, token, iocs, entity_type, mode='core'):
        '''
        Parameters
        ----------
        token : str
            Recorded Future API token
        iocs : list or dict
            List of IOCs to enrich or dict of IOCs keyed by name with the value as the RFID. 
        entity_type : {"IpAddress", "Hash", "InternetDomainName"}
            Name of Recorded Future entity type for IOC.
        mode : {"core", "related", "debug"}
            Subset of features to return with enrichment. "core" is default.
        '''
        self.rfqapi = RFAPI(token)
        self.response = collections.OrderedDict()
        # need all features early for scoring; they're removed later
        # need to test whether this can be avoided
        keys = self._FEATURES['core'].copy()  # copy so the class-level defaults stay untouched
        keys.update(self._FEATURES['debug'])
        if mode in ('related', 'debug'):
            keys.update(self._FEATURES['related'])
        if mode not in ('core', 'related', 'debug'):
            raise ValueError(
                '"mode" must be one of ("core", "related", "debug"). Input: %s.'
                % mode)
        self.mode = mode
        self.entity_type = entity_type
        if isinstance(iocs, list):
            self.iocs = self._get_rfids(iocs)
        elif isinstance(iocs, dict):
            self.iocs = iocs
        else:
            raise ValueError('"iocs" must be list or dict.')
        for ioc in self.iocs:
            new_resp = {}
            for key in keys:
                default = keys[key]
                # copy mutable defaults (the Related* lists) so IOCs do not share them
                new_resp[key] = list(default) if isinstance(default, list) else default
                if key == 'Name':
                    new_resp[key] = ioc
                elif key == 'RFID':
                    new_resp[key] = self.iocs[ioc]
                elif key == 'EntityType':
                    new_resp[key] = self.entity_type
            self.response[ioc] = new_resp
        self.keys = keys

    def get_keys(self, mode=None):
        '''Getter for the keys in the response.
        '''
        return [
            key for key in self.keys
            if key not in self._get_extra_features(mode)
        ]

    def _get_extra_features(self, mode=None):
        if not mode:
            mode = self.mode
        extra_features = []
        if mode in ('core', 'related'):
            extra_features = self._FEATURES['debug'].keys()
        return extra_features

    def enrich(self):
        '''Enriches the given IOC.
        Returns
        -------
        response : dict
            The enrichment package containing all keys requested by "mode" parameter.
        '''
        print "    Getting all references"
        max_index = None
        for names in _chunks(self.iocs.keys(), 250):
            refs, edetails = self._get_all_references(names)
            print "      Getting enrichment from references"
            max_index_cand = self._get_enrichment(refs, edetails)
            if max_index_cand < max_index or not max_index:
                # using < here because the references are no longer all retrieved from
                # the same query, so their indexing times can differ; keep the smallest
                # of the per-chunk maxima (a minimax)
                max_index = max_index_cand
            print "      Getting URL and Score"
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            # Get RF URL
            if 'RFURL' in ioc_resp:
                ioc_resp['RFURL'] = _generate_rfURL_from_entity(
                    ioc, ioc_resp.get('RFID', None))
            # Score the ref
            if 'Score' in ioc_resp:
                self.score(ioc_resp)
            # Remove unnecessary features
            extra_features = self._get_extra_features()
            for key in extra_features:
                del ioc_resp[key]
        return self.response, max_index

    def score(self, ioc_resp):
        spec_keys = ('7DayHits', '1DayHits')
        nonzero_keys = ('MaliciousHits', 'InfoSecHits', 'PasteHits',
                        'RelatedMalwareCount',
                        'RelatedCyberVulnerabilityCount',
                        'RelatedIpAddressCount',
                        'RelatedInternetDomainNameCount', 'RelatedHashCount')
        max_score = 0.0
        # score special keys
        if 'TotalHits' in self.keys:
            for key in filter(lambda k: k in self.keys, spec_keys):
                if ((ioc_resp[key] * 2) > ioc_resp["TotalHits"]):
                    ioc_resp['Score'] += 1
                max_score += len(spec_keys)
        # score nonzero keys
        for key in filter(lambda k: k in self.keys, nonzero_keys):
            if ioc_resp[key] > 0:
                ioc_resp['Score'] += 1
            max_score += 1
        ioc_resp['Score'] = ioc_resp['Score'] / max_score

    def _get_enrichment(self, refs, edetails):
        max_index = None
        today = datetime.datetime.today()
        one_day_hit_string = _rfid_date_conv(today -
                                             datetime.timedelta(days=1))
        seven_day_hit_string = _rfid_date_conv(today -
                                               datetime.timedelta(days=7))

        # first get everything from all references
        print "    Processing references"
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        recent_pub = {
            "MostRecent": {},
            "Paste": {},
            "InfoSec": {},
            "SocialMedia": {}
        }
        first_pub = {}
        for ref in refs:
            indexed = ref['document']['indexed']
            if indexed > max_index or not max_index:
                max_index = indexed
            fragment = ref['fragment'].lower()
            attrs = ref['attributes']
            source_topic = ref['document']['sourceId'].get('topic', None)
            source_media_type = ref['document']['sourceId'].get(
                'media_type', None)
            pub_date = ref['document']['published']
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc,
                           attrs.get('entities', []))
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [
                ioc for ioc in ioc_to_rfid
                if (ioc in fragment and ioc_to_rfid[ioc] not in ioc_rfids)
            ]
            # increment hit counts and get recent hits
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                # update dates
                recent_pub['MostRecent'][ioc] = self._safe_update_date(
                    ioc_resp, pub_date, recent_pub['MostRecent'][ioc]
                    if ioc in recent_pub['MostRecent'] else '', 'MostRecent',
                    pub_date > recent_pub['MostRecent'][ioc]
                    if ioc in recent_pub['MostRecent']
                    and len(recent_pub['MostRecent'][ioc]) > 0 else True)
                first_pub[ioc] = self._safe_update_date(
                    ioc_resp, pub_date,
                    first_pub[ioc] if ioc in first_pub else '',
                    'FirstPublished', pub_date < first_pub[ioc]
                    if ioc in first_pub and len(first_pub[ioc]) > 0 else True)
                # update hit counters
                self._safe_update_hits(ioc_resp, 'TotalHits', True)
                self._safe_update_hits(ioc_resp, '1DayHits',
                                       pub_date >= one_day_hit_string)
                self._safe_update_hits(ioc_resp, '7DayHits',
                                       pub_date >= seven_day_hit_string)
                self._safe_update_hits(
                    ioc_resp, 'MaliciousHits',
                    any(term in fragment
                        for term in self._MALICIOUS_INDICATORS))
                # update hit counters and references
                conditions = {
                    "InfoSec": source_topic == 'KPzZAE',
                    "Paste": source_media_type == 'KDS1Zp',
                    "SocialMedia": source_media_type == 'JxSEtC'
                }
                for key in conditions:
                    condition = conditions[key]
                    recent_pub[key][ioc] = self._safe_update_hits_and_refs(
                        ioc_resp, ref, key, condition,
                        recent_pub[key][ioc] if ioc in recent_pub[key] else '',
                        pub_date > recent_pub[key][ioc]
                        if ioc in recent_pub[key]
                        and len(recent_pub[key][ioc]) > 0 else True)
                # update references for first and recent
                self._safe_update_refs(
                    ioc_resp, ref, 'MostRecent',
                    pub_date == recent_pub['MostRecent'][ioc])
                self._safe_update_refs(ioc_resp, ref, 'First',
                                       pub_date == first_pub[ioc])
        # get related content at fragment scope
        if (self.mode in ('debug', 'related')
                and self._RELATED_ENTITY_SCOPE == 'fragment'):
            self._safe_get_related_entities_from_frags(refs, edetails)
        # get related content at document scope
        if (self.mode in ('debug', 'related')
                and self._RELATED_ENTITY_SCOPE == 'document'):
            # print "Getting related content from documents"
            docs = self._get_docs()
            self._safe_get_related_entities_from_docs(docs)
        return max_index

    def _safe_update_hits_and_refs(self, ioc_resp, ref, key, condition,
                                   cur_date, date_condition):
        pub_date = ref['document']['published']
        date_update = self._safe_update_date(ioc_resp, pub_date, cur_date, key,
                                             date_condition and condition)
        if condition:
            # update hits
            self._safe_update_hits(ioc_resp, key + 'Hits', condition)
            # get recent frags
            self._safe_update_refs(ioc_resp, ref, 'Recent' + key,
                                   pub_date == date_update)
        return date_update

    def _safe_update_date(self, ioc_resp, date, existing_val, key, condition):
        if condition and key in ioc_resp:
            ioc_resp[key] = date
        return date if condition else existing_val

    def _safe_update_hits(self, ioc_resp, key, condition):
        if condition and key in ioc_resp:
            ioc_resp[key] += 1

    def _safe_update_refs(self, ioc_resp, ref, key, condition):
        if condition:
            key_suffixes = {
                'Source':
                ref['document']['sourceId']['name'].replace('\n', ' ').replace(
                    '\r', ' '),
                'Title':
                ref['document']['title'].replace('\n', ' ').replace('\r', ' '),
                'Fragment':
                ref['fragment'].replace('\n', ' ').replace('\r', ' '),
                'URL':
                ref['document']['url'] if 'url' in ref['document'] else ''
            }
            for suffix in filter(lambda suf: key + suf in ioc_resp,
                                 key_suffixes):
                ioc_resp[key + suffix] = key_suffixes[suffix]

    def _get_all_references(self, names):
        refs = []
        seen_ids = set()
        edetails = {}
        q = {
            "instance": {
                "type": "Event",
                "limit": 25000,
                "searchtype": "scan"
            }
        }
        q['instance']['attributes'] = [[{
            "name": "Event.event_fragment",
            'string': names
        }]]
        rfids = [self.iocs[name] for name in names if self.iocs[name]]
        q['instance']['attributes'][0].append({
            "name": "entities",
            "entity": {
                "id": rfids
            }
        })
        # print len(self.iocs.keys()),
        for res in self.rfqapi.paged_query(q):
            refs.extend([
                inst for inst in res['instances'] if inst['id'] not in seen_ids
            ])
            seen_ids.update([inst['id'] for inst in res['instances']])
            edetails.update({
                eid: res['entities'][eid]
                for eid in res['entities']
                if res['entities'][eid]['type'] in self._RELATED_ENTITY_TYPES
            })
        return refs, edetails

    def _get_docs(self):
        all_docs = set()
        for names in _chunks(self.iocs.keys(), 250):
            q = {
                "instance": {
                    "type": "Event"
                },
                "output": {
                    "count": {
                        "axis": [{
                            "name": "attributes.entities",
                            "type": [self.entity_type],
                            "aspect": "name"
                        }, "document"],
                        "values": [self._INSTANCES_OR_DOCUMENTS]
                    }
                }
            }
            q['instance']['attributes'] = [[{
                "name": "Event.event_fragment",
                'string': names
            }]]
            rfids = [self.iocs[name] for name in names if self.iocs[name]]
            q['instance']['attributes'][0].append({
                "name": "entities",
                "entity": {
                    "id": rfids
                }
            })
            res = self.rfqapi.query(q)
            counts = res["counts"][0]
            if len(counts) != 0:
                for ioc in filter(lambda i: i in self.iocs, counts):
                    docids = counts[ioc].keys()
                    self.response[ioc]['DocumentIds'] = docids
                    all_docs.update(docids)
        return list(all_docs)

    def _safe_get_related_entities_from_frags(self, refs, edetails):
        ioc_to_rfid = self.iocs
        rfid_to_ioc = {}
        for ioc in filter(lambda i: ioc_to_rfid[i], ioc_to_rfid):
            rfid_to_ioc[ioc_to_rfid[ioc]] = ioc
        entities_to_lookup = set()
        for ref in refs:
            related_ents = ref['attributes'].get(
                'extended_entities', ref['attributes'].get('entities', []))
            entities_to_lookup.update(
                [eid for eid in related_ents if eid not in edetails])
        # print "Updating entity resolution"
        edetails.update(
            self._resolve_related_entities(list(entities_to_lookup)))
        # print "Updated related entities"
        for ref in refs:
            fragment = ref['fragment'].lower()
            # get related entities from reference
            related_ents = ref['attributes'].get(
                'extended_entities', ref['attributes'].get('entities', []))
            # get entities mentioned
            rfids = filter(lambda ioc: ioc in rfid_to_ioc, related_ents)
            ioc_rfids = [rfid for rfid in rfids if rfid in rfid_to_ioc]
            # get string hits that aren't included in the entity hits
            other_hits = [
                ioc for ioc in ioc_to_rfid
                if (ioc in fragment and ioc_to_rfid[ioc] not in ioc_rfids)
            ]
            iocs = [rfid_to_ioc[rfid] for rfid in ioc_rfids]
            for ioc in iocs + other_hits:
                ioc_resp = self.response[ioc]
                for ent in filter(
                        lambda eid: eid in edetails and eid != ioc_resp['RFID'],
                        related_ents):
                    etype, name = edetails[ent]['type'], edetails[ent]['name']
                    if name not in ioc_resp['Related' + etype]:
                        ioc_resp['Related' + etype].append(name)
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(
                        ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and 'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _resolve_related_entities(self, eids):
        if len(eids) == 0:
            return {}
        results = {}
        for ents in _chunks(eids, 250):
            q = {"entity": {"id": ents, "limit": 1001}}
            res = self.rfqapi.query(q)
            results.update({
                eid: res['entity_details'][eid]
                for eid in res['entity_details'] if res['entity_details'][eid]
                ['type'] in self._RELATED_ENTITY_TYPES
            })
        return results

    def _safe_get_related_entities_from_docs(self, docs):
        for docids in _chunks(docs, 250):
            q = {
                "instance": {
                    "type": "Event",
                    "document": {
                        "id": docids
                    }
                },
                "output": {
                    "count": {
                        "axis": [
                            "document", {
                                "name": "attributes.entities",
                                "type": self._RELATED_ENTITY_TYPES,
                                "aspect": "all"
                            }
                        ],
                        "values": [self._INSTANCES_OR_DOCUMENTS]
                    }
                }
            }
            res = self.rfqapi.query(q)
            counts = res['counts'][0]
            for ioc in self.response:
                ioc_resp = self.response[ioc]
                for docid in filter(lambda did: did in counts,
                                    ioc_resp['DocumentIds']):
                    for asp_name in filter(lambda n: n != 'NONE',
                                           counts[docid]):
                        name, unused, etype = rf_agg_name_parser(asp_name)
                        if name == ioc: continue
                        # update related counts
                        if name not in ioc_resp['Related' + etype]:
                            ioc_resp['Related' + etype].append(name)
        for ioc in self.response:
            ioc_resp = self.response[ioc]
            if 'DocumentIds' not in self.keys and 'DocumentIds' in ioc_resp:
                del ioc_resp['DocumentIds']
            for etype in self._RELATED_ENTITY_TYPES:
                if 'Related' + etype + 'Count' in ioc_resp:
                    ioc_resp['Related' + etype + 'Count'] = len(
                        ioc_resp['Related' + etype])
                if 'Related' + etype not in self.keys and 'Related' + etype in ioc_resp:
                    del ioc_resp['Related' + etype]

    def _get_rfids(self, iocs):
        new_iocs = collections.OrderedDict()
        edetails = {}
        for names in _chunks(iocs, 250):
            if len(names) == 0: continue
            q = {
                "entity": {
                    "name": names,
                    "type": self.entity_type,
                    "limit": 501
                }
            }
            res = self.rfqapi.query(q)
            if len(res['entities']) == 0: continue
            for ent in res['entities']:
                edetails[res['entity_details'][ent]['name']] = ent
        for ioc in iocs:
            new_iocs[ioc] = edetails[ioc] if ioc in edetails else None
        return new_iocs
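
A hedged usage sketch of IOCEnricher: enrich a handful of indicators and dump the result as CSV. The token and indicator values are placeholders, and the CSV layout simply mirrors get_keys(); nothing here is prescribed by the class itself beyond the calls shown above.

import csv
import sys

token = "YOUR_RF_TOKEN"                      # placeholder API token
iocs = ["8.8.8.8", "198.51.100.7"]           # hypothetical IpAddress indicators
enricher = IOCEnricher(token, iocs, "IpAddress", mode="core")
response, max_index = enricher.enrich()

columns = enricher.get_keys()
out = csv.DictWriter(sys.stdout, columns, extrasaction="ignore")
out.writeheader()
for name in response:
    out.writerow(response[name])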
Exemplo n.º 28
0
def main():
    options, args = parse_arguments()
    query = build_query(options, args)
    api = RFAPI(options.token)
    res = api.query(query)
    print res