def store_twitter_data(data_path: str, data_type: str = "SocialMediaPost"):
    """
    Store collected Twitter data in Neo4j database.

    data_path: path to the CSV file containing the collected data
    data_type: type of collected data ("SocialMediaPost" or "UserAccount")
    """
    # check input
    valid_data_types = ["SocialMediaPost", "UserAccount"]
    assert data_type in valid_data_types, "data_type should be 'SocialMediaPost' or 'UserAccount'!"

    # init containers for every node/edge type of the graph
    nodes = {
        "SocialMediaPost": [],
        "Username": [],
        "Person": [],
        "Location": [],
        "Text": [],
        "Media": [],
        "UserAccount": [],
        "Domain": [],
        "Keyword": [],
        "HashValue": [],
    }
    edges = {"INCLUSION": [], "CO_OCCURRENCE": []}

    # read data
    data = pd.read_csv(data_path, sep=",", header=0)
    # drop duplicates
    data.drop_duplicates(subset=["id"], keep="first", inplace=True)

    # basic information shared by all nodes/relationships
    basic_info = {"timestamp": get_standardized_now(), "schemaVersion": 0.1}

    # extract nodes/links for every entry
    records = data.to_dict(orient="records")

    if data_type == "SocialMediaPost":
        for x in tqdm(records, desc="SocialMediaPosts", total=len(records), unit="records"):
            ###############################################################################
            # SocialMediaPost
            # information given in data
            social_media_post = {
                "platform": "twitter",
                "id": str(x["id"]),
                "url": x["link"],
                "shared": x["retweet"],
                "likesCount": x["likes_count"],
                "repliesCount": x["replies_count"],
                "sharesCount": x["retweets_count"],
                # created_at is a millisecond epoch -> convert to seconds
                "datePublished": standardize_date_time(date_time=(int(x["created_at"] / 1000)), timestamp=True),
            }
            # NOTE: given the information Twint extracts, it can not be determined
            # whether the posting is a reply or the original one.
            if x["retweet"]:
                social_media_post["type"] = "share"
            # add basic information
            social_media_post.update(basic_info)
            # add unique id
            social_media_post_unique_id = create_unique_id(data=social_media_post, schema="SocialMediaPost")
            social_media_post["nodeId"] = social_media_post_unique_id
            # remove possible None and empty values
            social_media_post = clean_dict(d=social_media_post)
            # check schema
            if check_schema(data=social_media_post, schema="SocialMediaPost"):
                # add node to graph
                nodes["SocialMediaPost"].append(social_media_post)
            else:
                # without the posting node, none of the related nodes make sense
                continue

            ###############################################################################
            # Username (author)
            username = {"username": x["username"]}
            username.update(basic_info)
            username_unique_id = create_unique_id(data=username, schema="Username")
            username["nodeId"] = username_unique_id
            username = clean_dict(d=username)
            if check_schema(data=username, schema="Username"):
                nodes["Username"].append(username)
                # posting INCLUDES the author's username
                edges["INCLUSION"].append({"a": social_media_post_unique_id, "b": username_unique_id})

            ###############################################################################
            # Person (display name of the author)
            person = extract_name(s=x["name"])
            person.update(basic_info)
            person_unique_id = create_unique_id(data=person, schema="Person")
            person["nodeId"] = person_unique_id
            person = clean_dict(d=person)
            if check_schema(data=person, schema="Person"):
                nodes["Person"].append(person)
                edges["INCLUSION"].append({"a": social_media_post_unique_id, "b": person_unique_id})
                edges["CO_OCCURRENCE"].append({"a": username_unique_id, "b": person_unique_id})

            ###############################################################################
            # Location (geo coordinates of the posting)
            geo = x["geo"]
            # NaN != NaN, so this skips rows without coordinates
            if geo == geo:
                location = complete_location(data=geo, given="coordinates")
                location.update(basic_info)
                location_unique_id = create_unique_id(data=location, schema="Location")
                location["nodeId"] = location_unique_id
                location = clean_dict(d=location)
                if check_schema(data=location, schema="Location"):
                    nodes["Location"].append(location)
                    edges["INCLUSION"].append({"a": social_media_post_unique_id, "b": location_unique_id})
                    edges["CO_OCCURRENCE"].append({"a": username_unique_id, "b": location_unique_id})
                    edges["CO_OCCURRENCE"].append({"a": person_unique_id, "b": location_unique_id})

            ###############################################################################
            # Media (Photos)
            media_urls = list(set([n.strip() for n in ast.literal_eval(x["photos"])]))
            for url in media_urls:
                # download; skip media that could not be retrieved
                file_path = download_media(url=url)
                if file_path is None:
                    continue
                media = {"url": url, "type": "image", **extract_media_info(s=url)}
                media.update(basic_info)
                media_unique_id = create_unique_id(data=media, schema="Media")
                media["nodeId"] = media_unique_id
                media = clean_dict(d=media)
                if check_schema(data=media, schema="Media"):
                    nodes["Media"].append(media)
                    edges["INCLUSION"].append({"a": social_media_post_unique_id, "b": media_unique_id})
                    # add hash value of the downloaded file
                    hash_value = {"hashValue": get_checksum(file=file_path)}
                    hash_value.update(basic_info)
                    hash_value_unique_id = create_unique_id(data=hash_value, schema="HashValue")
                    hash_value["nodeId"] = hash_value_unique_id
                    hash_value = clean_dict(d=hash_value)
                    if check_schema(data=hash_value, schema="HashValue"):
                        nodes["HashValue"].append(hash_value)
                        edges["INCLUSION"].append({"a": media_unique_id, "b": hash_value_unique_id})

            ###############################################################################
            # Text (the tweet itself)
            urls_in_text = [n.strip() for n in ast.literal_eval(x["urls"])]
            text = {"text": x["tweet"]}
            text.update(basic_info)
            text_unique_id = create_unique_id(data=text, schema="Text")
            text["nodeId"] = text_unique_id
            text = clean_dict(d=text)
            if check_schema(data=text, schema="Text"):
                nodes["Text"].append(text)
                edges["INCLUSION"].append({"a": social_media_post_unique_id, "b": text_unique_id})

            ###############################################################################
            # Keywords (hashtags)
            keywords = list(set([n.strip() for n in ast.literal_eval(x["hashtags"])]))
            # node IDs of keywords added so far (for pairwise CO_OCCURRENCE)
            node_ids = []
            for h in keywords:
                keyword = {"keyword": h}
                keyword.update(basic_info)
                keyword_unique_id = create_unique_id(data=keyword, schema="Keyword")
                keyword["nodeId"] = keyword_unique_id
                keyword = clean_dict(d=keyword)
                if check_schema(data=keyword, schema="Keyword"):
                    nodes["Keyword"].append(keyword)
                    edges["INCLUSION"].append({"a": text_unique_id, "b": keyword_unique_id})
                    # pairwise CO_OCCURRENCE with previously *added* keywords only
                    # (FIX: IDs are tracked after the schema check so edges never
                    # reference nodes that were rejected)
                    for i in node_ids:
                        edges["CO_OCCURRENCE"].append({"a": i, "b": keyword_unique_id})
                    node_ids.append(keyword_unique_id)

            ###############################################################################
            # Domains (URLs in the tweet text)
            node_ids = []
            for u in list(set(urls_in_text)):
                # extract URL information; keep the first extracted entry
                extracted_url_info = extract_urls(u)
                try:
                    extracted_url_info = extracted_url_info[next(iter(extracted_url_info))]
                except StopIteration:
                    continue
                if not extracted_url_info:
                    continue
                # do not consider large webpages
                domain = extracted_url_info["domain"]
                if domain in ALEXA1M:
                    continue
                domain = {"domain": domain}
                domain.update(basic_info)
                domain_unique_id = create_unique_id(data=domain, schema="Domain")
                domain["nodeId"] = domain_unique_id
                domain = clean_dict(d=domain)
                if check_schema(data=domain, schema="Domain"):
                    nodes["Domain"].append(domain)
                    edges["INCLUSION"].append({"a": text_unique_id, "b": domain_unique_id})
                    for i in node_ids:
                        edges["CO_OCCURRENCE"].append({"a": i, "b": domain_unique_id})
                    node_ids.append(domain_unique_id)

            ###############################################################################
            # Usernames (Mentions)
            mentions = list(set([n.strip() for n in ast.literal_eval(x["mentions"])]))
            node_ids = []
            for m in mentions:
                username = {"username": m}
                username.update(basic_info)
                username_unique_id = create_unique_id(data=username, schema="Username")
                username["nodeId"] = username_unique_id
                username = clean_dict(d=username)
                if check_schema(data=username, schema="Username"):
                    nodes["Username"].append(username)
                    edges["INCLUSION"].append({"a": text_unique_id, "b": username_unique_id})
                    for i in node_ids:
                        edges["CO_OCCURRENCE"].append({"a": i, "b": username_unique_id})
                    node_ids.append(username_unique_id)

    if data_type == "UserAccount":
        for x in tqdm(records, desc="UserAccounts", total=len(records), unit="records"):
            ###############################################################################
            # UserAccount
            user_account = {
                "private": bool(x["private"]),
                "verifiedByPlatform": bool(x["verified"]),
                "followersCount": x["followers"],
                "followingCount": x["following"],
                "dateTimeJoined": standardize_date_time(
                    date_time=(x["join_date"] + " " + x["join_time"]), format="%d %b %Y %I:%M %p"
                ),
                "mediaCount": x["media"],
                "postingsCount": x["tweets"],
                "platform": "twitter",
                "id": str(x["id"]),
                # FIX: scheme was malformed ("https:/twitter.com/")
                "url": ("https://twitter.com/" + x["username"]),
                "likesCount": x["likes"],
            }
            user_account.update(basic_info)
            user_account_unique_id = create_unique_id(data=user_account, schema="UserAccount")
            user_account["nodeId"] = user_account_unique_id
            user_account = clean_dict(d=user_account)
            if check_schema(data=user_account, schema="UserAccount"):
                nodes["UserAccount"].append(user_account)
            else:
                continue

            ###############################################################################
            # Username
            # The username node has already been added during storage of the
            # collected Tweets; only its unique ID is recalculated here so the
            # user account can be linked to it.
            username_unique_id = create_unique_id(data={"username": x["username"]}, schema="Username")
            edges["INCLUSION"].append({"a": user_account_unique_id, "b": username_unique_id})

            ###############################################################################
            # Person
            # Same as above: the person node already exists, only its ID is needed.
            person_unique_id = create_unique_id(data=extract_name(s=x["name"]), schema="Person")
            edges["INCLUSION"].append({"a": user_account_unique_id, "b": person_unique_id})

            ###############################################################################
            # Text (bio)
            text = x["bio"]
            # NaN != NaN, so this skips accounts without a bio
            if text == text:
                text = {"text": text}
                text.update(basic_info)
                text_unique_id = create_unique_id(data=text, schema="Text")
                text["nodeId"] = text_unique_id
                text = clean_dict(d=text)
                if check_schema(data=text, schema="Text"):
                    nodes["Text"].append(text)
                    edges["INCLUSION"].append({"a": user_account_unique_id, "b": text_unique_id})

            ###############################################################################
            # Domains (URLs in bio)
            text = x["bio"]
            if text == text:
                extracted_urls = extract_urls(text)
                node_ids = []
                for u in list(set(extracted_urls.keys())):
                    url_info = extracted_urls[u]
                    if not url_info:
                        continue
                    # do not consider large webpages
                    domain = url_info["domain"]
                    if domain in ALEXA1M:
                        continue
                    domain = {"domain": domain}
                    domain.update(basic_info)
                    domain_unique_id = create_unique_id(data=domain, schema="Domain")
                    domain["nodeId"] = domain_unique_id
                    domain = clean_dict(d=domain)
                    if check_schema(data=domain, schema="Domain"):
                        nodes["Domain"].append(domain)
                        edges["INCLUSION"].append({"a": text_unique_id, "b": domain_unique_id})
                        for i in node_ids:
                            edges["CO_OCCURRENCE"].append({"a": i, "b": domain_unique_id})
                        node_ids.append(domain_unique_id)

            ###############################################################################
            # Location (free-text account location)
            location = x["location"]
            if location == location:
                location = complete_location(data=location, given="address")
                # only continue if the location could be resolved
                if location:
                    location.update(basic_info)
                    location_unique_id = create_unique_id(data=location, schema="Location")
                    location["nodeId"] = location_unique_id
                    location = clean_dict(d=location)
                    if check_schema(data=location, schema="Location"):
                        nodes["Location"].append(location)
                        edges["INCLUSION"].append({"a": user_account_unique_id, "b": location_unique_id})
                        edges["CO_OCCURRENCE"].append({"a": username_unique_id, "b": location_unique_id})
                        edges["CO_OCCURRENCE"].append({"a": person_unique_id, "b": location_unique_id})

            ###############################################################################
            # Media (profile image and profile background image)
            # FIX: deduplicate while PRESERVING order -- the previous list(set(...))
            # made the i == 0 / i == 1 image-type flags below non-deterministic.
            media_urls = list(dict.fromkeys([x["profile_image_url"], x["background_image"]]))
            for i, url in enumerate(media_urls):
                # URL should not be NaN
                if url == url:
                    file_path = download_media(url=url)
                    if file_path is None:
                        continue
                    media = {"url": url, "type": "image", **extract_media_info(s=url)}
                    # add type of image properties
                    if i == 0:
                        media["profileImage"] = True
                    if i == 1:
                        media["profileBackgroundImage"] = True
                    media.update(basic_info)
                    media_unique_id = create_unique_id(data=media, schema="Media")
                    media["nodeId"] = media_unique_id
                    media = clean_dict(d=media)
                    if check_schema(data=media, schema="Media"):
                        nodes["Media"].append(media)
                        edges["INCLUSION"].append({"a": user_account_unique_id, "b": media_unique_id})
                        # add hash value of the downloaded file
                        hash_value = {"hashValue": get_checksum(file=file_path)}
                        hash_value.update(basic_info)
                        hash_value_unique_id = create_unique_id(data=hash_value, schema="HashValue")
                        hash_value["nodeId"] = hash_value_unique_id
                        hash_value = clean_dict(d=hash_value)
                        if check_schema(data=hash_value, schema="HashValue"):
                            nodes["HashValue"].append(hash_value)
                            edges["INCLUSION"].append({"a": media_unique_id, "b": hash_value_unique_id})

    # load data into database
    load_graph_into_db(nodes=nodes, edges=edges)
edges_ner["INCLUSION"].append({ "a": t_node_id, "b": node_id }) # add CO_OCCURRENCE relationship to other derived entities for i in node_ids[:n_i]: edges_ner["CO_OCCURRENCE"].append({ "a": i, "b": node_id }) n_i += 1 # store in intelligence graph load_graph_into_db(nodes=nodes_ner, edges=edges_ner) ############################################################################################### # Betweenness centrality if calculate_centrality: # calculate betweenness centrality and store as node property betweenness_centrality() # get top betweenness centralities for usernames and keywords with neo4j.session() as session: # username result_usernames = session.run( "MATCH (n:Username) RETURN n.betweennessCentrality AS betweennessCentrality, n.username AS username ORDER BY n.betweennessCentrality DESC LIMIT 10" ) usernames_top_10 = [[r["username"], r["betweennessCentrality"]]
def load_in_db():
    """
    Load collected information into database according to the AMONet model.

    Reads VK accounts (with an attached Twitter profile) from MongoDB and, per
    account, builds graph nodes (UserAccount, Username, Location, Person,
    Organization, Phone, Domain) plus INCLUSION/CO_OCCURRENCE edges, and loads
    them into the database.
    """
    # client and database to connect to MongoDB
    mongo_client, mongo_db = mongodb_connect()
    # get account related information
    related_information = list(
        mongo_db["osint_vk_accounts"].find(
            {"twitter_profile": {"$exists": "true"}},
            {
                "twitter_profile": 1,
                "screen_name": 1,
                "city": 1,
                "country": 1,
                "first_name": 1,
                "last_name": 1,
                "university_name": 1,
                "home_phone": 1,
                "instagram": 1,
                "site": 1,
                "skype": 1,
                "twitter": 1,
                "facebook": 1,
                "_id": 0,
            },
        )
    )

    # basic information of all nodes/relationships
    basic_info = {"timestamp": get_standardized_now(), "schemaVersion": 0.1}

    for x in tqdm(related_information, desc="accounts", total=len(related_information), unit="records"):
        # per-account graph containers (loaded at the end of each iteration)
        nodes = {
            "Username": [],
            "UserAccount": [],
            "Location": [],
            "Person": [],
            "Organization": [],
            "Phone": [],
            "Domain": [],
        }
        edges = {"INCLUSION": [], "CO_OCCURRENCE": []}

        #######################################################################
        # VK account/username
        vk_user_account = {
            "platform": "vk",
            "id": get_vk_id(username=x["screen_name"]),
            # FIX: scheme was malformed ("https:/vk.com/")
            "url": ("https://vk.com/" + x["screen_name"]),
        }
        # throttle lookups against the VK API
        time.sleep(0.3)
        vk_user_account.update(basic_info)
        vk_user_account_unique_id = create_unique_id(data=vk_user_account, schema="UserAccount")
        vk_user_account["nodeId"] = vk_user_account_unique_id
        # remove possible None and empty values
        vk_user_account = clean_dict(d=vk_user_account)
        if check_schema(data=vk_user_account, schema="UserAccount"):
            nodes["UserAccount"].append(vk_user_account)
            # add username
            vk_username = {"username": x["screen_name"]}
            vk_username.update(basic_info)
            vk_username_unique_id = create_unique_id(data=vk_username, schema="Username")
            vk_username["nodeId"] = vk_username_unique_id
            if check_schema(data=vk_username, schema="Username"):
                nodes["Username"].append(vk_username)
                edges["INCLUSION"].append({"a": vk_user_account_unique_id, "b": vk_username_unique_id})
            else:
                vk_username_unique_id = ""  # reset
        else:
            vk_user_account_unique_id = ""  # reset
            # without the account node nothing can be linked
            continue

        #######################################################################
        # VK location
        vk_location_unique_id = ""  # default: no usable location
        try:
            vk_location = complete_location(data=x["city"]["title"] + ", " + x["country"]["title"], given="address")
            time.sleep(0.3)
            vk_location.update(basic_info)
            vk_location_unique_id = create_unique_id(data=vk_location, schema="Location")
            vk_location["nodeId"] = vk_location_unique_id
            vk_location = clean_dict(d=vk_location)
            if check_schema(data=vk_location, schema="Location"):
                nodes["Location"].append(vk_location)
                edges["INCLUSION"].append({"a": vk_user_account_unique_id, "b": vk_location_unique_id})
                if vk_username_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_username_unique_id, "b": vk_location_unique_id})
        except Exception:
            # city/country missing or geocoding failed
            vk_location_unique_id = ""  # reset

        #######################################################################
        # VK person
        vk_person_unique_id = ""  # default: no usable name
        try:
            vk_person = extract_name(s=x["first_name"] + " " + x["last_name"])
            vk_person.update(basic_info)
            vk_person_unique_id = create_unique_id(data=vk_person, schema="Person")
            vk_person["nodeId"] = vk_person_unique_id
            vk_person = clean_dict(d=vk_person)
            if check_schema(data=vk_person, schema="Person"):
                nodes["Person"].append(vk_person)
                edges["INCLUSION"].append({"a": vk_user_account_unique_id, "b": vk_person_unique_id})
                if vk_username_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_username_unique_id, "b": vk_person_unique_id})
                if vk_location_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_location_unique_id, "b": vk_person_unique_id})
        except KeyError:
            vk_person_unique_id = ""  # reset

        #######################################################################
        # VK organization (university)
        vk_organization_unique_id = ""  # default: no usable organization
        try:
            vk_organization = {
                "name": x["university_name"],
                "rawData": x["university_name"],
                "description": "university",
            }
            vk_organization.update(basic_info)
            vk_organization_unique_id = create_unique_id(data=vk_organization, schema="Organization")
            vk_organization["nodeId"] = vk_organization_unique_id
            vk_organization = clean_dict(d=vk_organization)
            if check_schema(data=vk_organization, schema="Organization"):
                nodes["Organization"].append(vk_organization)
                edges["INCLUSION"].append({"a": vk_user_account_unique_id, "b": vk_organization_unique_id})
                if vk_username_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_username_unique_id, "b": vk_organization_unique_id})
                if vk_location_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_location_unique_id, "b": vk_organization_unique_id})
                if vk_person_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_person_unique_id, "b": vk_organization_unique_id})
        except KeyError:
            vk_organization_unique_id = ""  # reset

        #######################################################################
        # VK phone
        # FIX: must be initialized up front -- a parseable but *invalid* number
        # previously took neither the assignment nor the except path, leaving
        # vk_phone_unique_id undefined (NameError in later sections).
        vk_phone_unique_id = ""
        try:
            # check if valid phone number
            phone = phonenumbers.parse(x["home_phone"], None)
            if phonenumbers.is_valid_number(phone):
                vk_phone = {"phone": phonenumbers.format_number(phone, phonenumbers.PhoneNumberFormat.E164)}
                vk_phone.update(basic_info)
                vk_phone_unique_id = create_unique_id(data=vk_phone, schema="Phone")
                vk_phone["nodeId"] = vk_phone_unique_id
                vk_phone = clean_dict(d=vk_phone)
                if check_schema(data=vk_phone, schema="Phone"):
                    nodes["Phone"].append(vk_phone)
                    edges["INCLUSION"].append({"a": vk_user_account_unique_id, "b": vk_phone_unique_id})
                    if vk_username_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_username_unique_id, "b": vk_phone_unique_id})
                    if vk_location_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_location_unique_id, "b": vk_phone_unique_id})
                    if vk_person_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_person_unique_id, "b": vk_phone_unique_id})
                    if vk_organization_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_organization_unique_id, "b": vk_phone_unique_id})
        except Exception:
            # missing field or unparseable number
            vk_phone_unique_id = ""  # reset

        #######################################################################
        # VK domains
        # IDs of domains added so far (for pairwise CO_OCCURRENCE)
        domains_unique_ids = []
        # FIX: "site" may be absent from the projection result -> use .get()
        if x.get("site"):
            extracted_domains = extract_urls(s=x["site"])
            for d in extracted_domains.values():
                vk_domain = {"domain": d["domain"]}
                vk_domain.update(basic_info)
                vk_domain_unique_id = create_unique_id(data=vk_domain, schema="Domain")
                vk_domain["nodeId"] = vk_domain_unique_id
                vk_domain = clean_dict(d=vk_domain)
                if check_schema(data=vk_domain, schema="Domain"):
                    nodes["Domain"].append(vk_domain)
                    edges["INCLUSION"].append({"a": vk_user_account_unique_id, "b": vk_domain_unique_id})
                    if vk_username_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_username_unique_id, "b": vk_domain_unique_id})
                    if vk_location_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_location_unique_id, "b": vk_domain_unique_id})
                    if vk_person_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_person_unique_id, "b": vk_domain_unique_id})
                    if vk_organization_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_organization_unique_id, "b": vk_domain_unique_id})
                    if vk_phone_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": vk_phone_unique_id, "b": vk_domain_unique_id})
                    for i in domains_unique_ids:
                        edges["CO_OCCURRENCE"].append({"a": i, "b": vk_domain_unique_id})
                    domains_unique_ids.append(vk_domain_unique_id)

        #######################################################################
        # VK additional usernames (linked accounts)
        # FIX: all linked-account fields are optional in the projection result;
        # the original accessed x["twitter"] unguarded (KeyError if absent).
        additional_usernames = []
        for linked_key in ("twitter", "facebook", "instagram", "skype"):
            if linked_key in x:
                additional_usernames.append(x[linked_key])
        additional_usernames_unique_ids = []
        for u in additional_usernames:
            vk_additional_username = {"username": u}
            vk_additional_username.update(basic_info)
            vk_additional_username_unique_id = create_unique_id(data=vk_additional_username, schema="Username")
            vk_additional_username["nodeId"] = vk_additional_username_unique_id
            vk_additional_username = clean_dict(d=vk_additional_username)
            if check_schema(data=vk_additional_username, schema="Username"):
                nodes["Username"].append(vk_additional_username)
                edges["INCLUSION"].append({"a": vk_user_account_unique_id, "b": vk_additional_username_unique_id})
                if vk_username_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_username_unique_id, "b": vk_additional_username_unique_id})
                if vk_location_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_location_unique_id, "b": vk_additional_username_unique_id})
                if vk_person_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_person_unique_id, "b": vk_additional_username_unique_id})
                if vk_organization_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_organization_unique_id, "b": vk_additional_username_unique_id})
                if vk_phone_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": vk_phone_unique_id, "b": vk_additional_username_unique_id})
                for i in domains_unique_ids:
                    edges["CO_OCCURRENCE"].append({"a": i, "b": vk_additional_username_unique_id})
                for i in additional_usernames_unique_ids:
                    edges["CO_OCCURRENCE"].append({"a": i, "b": vk_additional_username_unique_id})
                additional_usernames_unique_ids.append(vk_additional_username_unique_id)

        #######################################################################
        # Twitter account/username
        twitter_user_account = {
            "platform": "twitter",
            "id": x["twitter_profile"]["id_str"],
            # FIX: scheme was malformed ("https:/twitter.com/").
            # NOTE(review): URL is built from id_str; screen_name may have been
            # intended -- confirm against the data model before changing.
            "url": ("https://twitter.com/" + x["twitter_profile"]["id_str"]),
        }
        twitter_user_account.update(basic_info)
        twitter_user_account_unique_id = create_unique_id(data=twitter_user_account, schema="UserAccount")
        twitter_user_account["nodeId"] = twitter_user_account_unique_id
        twitter_user_account = clean_dict(d=twitter_user_account)
        if check_schema(data=twitter_user_account, schema="UserAccount"):
            nodes["UserAccount"].append(twitter_user_account)
            # add username
            twitter_username = {"username": x["twitter_profile"]["screen_name"]}
            twitter_username.update(basic_info)
            twitter_username_unique_id = create_unique_id(data=twitter_username, schema="Username")
            twitter_username["nodeId"] = twitter_username_unique_id
            if check_schema(data=twitter_username, schema="Username"):
                nodes["Username"].append(twitter_username)
                edges["INCLUSION"].append({"a": twitter_user_account_unique_id, "b": twitter_username_unique_id})
            else:
                twitter_username_unique_id = ""  # reset
        else:
            twitter_user_account_unique_id = ""  # reset
            continue

        #######################################################################
        # Twitter location
        twitter_location_unique_id = ""  # default: no usable location
        if x["twitter_profile"]["location"]:
            twitter_location = complete_location(data=x["twitter_profile"]["location"], given="address")
            time.sleep(0.3)
            if twitter_location:
                twitter_location.update(basic_info)
                twitter_location_unique_id = create_unique_id(data=twitter_location, schema="Location")
                twitter_location["nodeId"] = twitter_location_unique_id
                twitter_location = clean_dict(d=twitter_location)
                if check_schema(data=twitter_location, schema="Location"):
                    nodes["Location"].append(twitter_location)
                    edges["INCLUSION"].append({"a": twitter_user_account_unique_id, "b": twitter_location_unique_id})
                    if twitter_username_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": twitter_username_unique_id, "b": twitter_location_unique_id})

        #######################################################################
        # Twitter person
        twitter_person_unique_id = ""  # default: no usable name
        if x["twitter_profile"]["name"]:
            twitter_person = extract_name(s=x["twitter_profile"]["name"])
            twitter_person.update(basic_info)
            twitter_person_unique_id = create_unique_id(data=twitter_person, schema="Person")
            twitter_person["nodeId"] = twitter_person_unique_id
            twitter_person = clean_dict(d=twitter_person)
            if check_schema(data=twitter_person, schema="Person"):
                nodes["Person"].append(twitter_person)
                edges["INCLUSION"].append({"a": twitter_user_account_unique_id, "b": twitter_person_unique_id})
                if twitter_username_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": twitter_username_unique_id, "b": twitter_person_unique_id})
                if twitter_location_unique_id:
                    edges["CO_OCCURRENCE"].append({"a": twitter_location_unique_id, "b": twitter_person_unique_id})

        #######################################################################
        # Twitter domains
        domains_unique_ids = []
        if x["twitter_profile"]["url"]:
            extracted_domains = extract_urls(s=x["twitter_profile"]["url"])
            for d in extracted_domains.values():
                twitter_domain = {"domain": d["domain"]}
                twitter_domain.update(basic_info)
                twitter_domain_unique_id = create_unique_id(data=twitter_domain, schema="Domain")
                twitter_domain["nodeId"] = twitter_domain_unique_id
                twitter_domain = clean_dict(d=twitter_domain)
                if check_schema(data=twitter_domain, schema="Domain"):
                    nodes["Domain"].append(twitter_domain)
                    edges["INCLUSION"].append({"a": twitter_user_account_unique_id, "b": twitter_domain_unique_id})
                    if twitter_username_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": twitter_username_unique_id, "b": twitter_domain_unique_id})
                    if twitter_location_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": twitter_location_unique_id, "b": twitter_domain_unique_id})
                    if twitter_person_unique_id:
                        edges["CO_OCCURRENCE"].append({"a": twitter_person_unique_id, "b": twitter_domain_unique_id})
                    for i in domains_unique_ids:
                        edges["CO_OCCURRENCE"].append({"a": i, "b": twitter_domain_unique_id})
                    domains_unique_ids.append(twitter_domain_unique_id)

        # load this account's data into the database (nodes/edges are
        # re-initialized per account, so the load happens inside the loop)
        load_graph_into_db(nodes=nodes, edges=edges)
def cross_network_user_sample():
    """
    Extract sample of linked accounts of VK users and process to be used in
    AMONet framework.

    For every sampled VK account, UserAccount/Username nodes are created for
    the VK profile and for any linked Twitter, Facebook and Instagram
    profiles; the nodes are connected with INCLUSION/CO_OCCURRENCE
    relationships and loaded into the intelligence graph.

    Fixes over the previous revision:
    - profile URLs were malformed ("https:/..." instead of "https://...")
    - a typo reset a non-existent variable (twitter_user_username_unique_id)
    - per-row IDs are now reset up front, so failure branches can no longer
      attach edges to a previously processed row's nodes
    - presence guards now test the username ID that is actually linked
      (the Facebook branch guarded on the Twitter *account* ID but linked the
      Twitter *username* ID; Instagram did the same with Facebook)
    - VK edges are guarded, so rows without a valid VK account no longer
      produce relationships with empty endpoints
    - the VK value is type-checked (NaN is truthy and previously crashed the
      URL concatenation)
    """
    # get data
    # client and database to connect to MongoDB
    mongo_client, mongo_db = mongodb_connect()
    # get account related information (fixed sample window: skip=40, limit=10)
    related_information = list(mongo_db["osint_vk_accounts"].find(
        {
            "$or": [
                {"instagram": {"$exists": "true"}},
                {"facebook": {"$exists": "true"}},
                {"twitter": {"$exists": "true"}},
            ]
        },
        {
            "instagram": 1,
            "facebook": 1,
            "twitter": 1,
            "screen_name": 1,
            "_id": 0,
        },
        skip=40,
        limit=10,
    ))

    # create dataframe from information
    df = pd.DataFrame(related_information)
    df = df.rename(columns={"screen_name": "vk", "site": "website"})
    # drop users with no connected account, even no vk screen name
    df.dropna(how="all", inplace=True)

    # store data in network
    # init
    nodes = {
        "Username": [],
        "UserAccount": [],
    }
    edges = {"INCLUSION": [], "CO_OCCURRENCE": []}

    # basic information of all nodes/relationships
    basic_info = {"timestamp": get_standardized_now(), "schemaVersion": 0.1}

    def _account_and_username(platform: str, platform_id, url: str,
                              username: str):
        """
        Create a UserAccount and a Username node for one platform profile.

        Appends the nodes to `nodes` and the INCLUSION relationship between
        them to `edges`. Returns (account_unique_id, username_unique_id);
        an empty string marks a node that was not created because its schema
        check failed.
        """
        user_account = {"platform": platform, "id": platform_id, "url": url}
        # add basic information
        user_account.update(basic_info)
        # add unique id
        account_unique_id = create_unique_id(data=user_account,
                                             schema="UserAccount")
        user_account["nodeId"] = account_unique_id
        # remove possible None and empty values
        user_account = clean_dict(d=user_account)
        # check schema
        if not check_schema(data=user_account, schema="UserAccount"):
            return "", ""
        # add node to graph
        nodes["UserAccount"].append(user_account)

        # add username
        username_node = {"username": username}
        username_node.update(basic_info)
        username_unique_id = create_unique_id(data=username_node,
                                              schema="Username")
        username_node["nodeId"] = username_unique_id
        if not check_schema(data=username_node, schema="Username"):
            return account_unique_id, ""
        # add node to graph
        nodes["Username"].append(username_node)
        # add INCLUSION relationship (account contains its own username)
        edges["INCLUSION"].append({
            "a": account_unique_id,
            "b": username_unique_id
        })
        return account_unique_id, username_unique_id

    for _, x in tqdm(df.iterrows(), desc="VK accounts", total=len(df),
                     unit="records"):
        # reset all per-row IDs so no edge can reference a previous row
        vk_user_account_unique_id = vk_username_unique_id = ""
        twitter_user_account_unique_id = twitter_username_unique_id = ""
        facebook_user_account_unique_id = facebook_username_unique_id = ""

        #######################################################################
        # VK
        if isinstance(x["vk"], str) and x["vk"]:
            vk_user_account_unique_id, vk_username_unique_id = \
                _account_and_username(
                    platform="vk",
                    platform_id=get_vk_id(username=x["vk"]),
                    url="https://vk.com/" + x["vk"],
                    username=x["vk"],
                )
            time.sleep(0.3)  # be gentle with the VK API
            if not vk_user_account_unique_id:
                # VK account failed the schema check: nothing to link to
                continue

        #######################################################################
        # Twitter
        try:
            # raises when x["twitter"] is missing/NaN or the API call fails
            twitter_user_account_unique_id, twitter_username_unique_id = \
                _account_and_username(
                    platform="twitter",
                    platform_id=get_twitter_user(username=x["twitter"]).id_str,
                    url="https://twitter.com/" + x["twitter"],
                    username=x["twitter"],
                )
        except Exception:
            # reset
            twitter_user_account_unique_id = twitter_username_unique_id = ""
        if twitter_username_unique_id:
            if vk_user_account_unique_id:
                # add INCLUSION relationship
                edges["INCLUSION"].append({
                    "a": vk_user_account_unique_id,
                    "b": twitter_username_unique_id
                })
            if vk_username_unique_id:
                # add CO_OCCURRENCE relationship
                edges["CO_OCCURRENCE"].append({
                    "a": vk_username_unique_id,
                    "b": twitter_username_unique_id
                })

        #######################################################################
        # Facebook
        if x["facebook"] and isinstance(x["facebook"], str):
            # get Facebook ID
            facebook_id = extract_facebook_id(s=x["facebook"])
            if facebook_id:
                facebook_user_account_unique_id, facebook_username_unique_id = \
                    _account_and_username(
                        platform="facebook",
                        platform_id=facebook_id,
                        url="https://facebook.com/" + facebook_id,
                        username=facebook_id,
                    )
        if facebook_username_unique_id:
            if vk_user_account_unique_id:
                # add INCLUSION relationship
                edges["INCLUSION"].append({
                    "a": vk_user_account_unique_id,
                    "b": facebook_username_unique_id
                })
            if vk_username_unique_id:
                # add CO_OCCURRENCE relationships
                edges["CO_OCCURRENCE"].append({
                    "a": vk_username_unique_id,
                    "b": facebook_username_unique_id
                })
            if twitter_username_unique_id:
                edges["CO_OCCURRENCE"].append({
                    "a": twitter_username_unique_id,
                    "b": facebook_username_unique_id
                })

        #######################################################################
        # Instagram
        if x["instagram"] and isinstance(x["instagram"], str):
            # get Instagram ID
            instagram_id = get_instagram_id(username=x["instagram"])
            if instagram_id:
                instagram_user_account_unique_id, instagram_username_unique_id = \
                    _account_and_username(
                        platform="instagram",
                        platform_id=instagram_id,
                        url="https://instagram.com/" + instagram_id,
                        username=x["instagram"],
                    )
                if instagram_username_unique_id:
                    if vk_user_account_unique_id:
                        # add INCLUSION relationship
                        edges["INCLUSION"].append({
                            "a": vk_user_account_unique_id,
                            "b": instagram_username_unique_id
                        })
                    if vk_username_unique_id:
                        # add CO_OCCURRENCE relationships
                        edges["CO_OCCURRENCE"].append({
                            "a": vk_username_unique_id,
                            "b": instagram_username_unique_id
                        })
                    if twitter_username_unique_id:
                        edges["CO_OCCURRENCE"].append({
                            "a": twitter_username_unique_id,
                            "b": instagram_username_unique_id
                        })
                    if facebook_username_unique_id:
                        edges["CO_OCCURRENCE"].append({
                            "a": facebook_username_unique_id,
                            "b": instagram_username_unique_id
                        })

    # load data into database
    load_graph_into_db(nodes=nodes, edges=edges)
def enrich_node(node: dict) -> List[Dict]:
    """
    Gather OSINT data and enrich given node.

    The node's entity type (node["label"]) selects the applicable OSINT
    tools from the configured OSINT matrix. Every entity returned by a tool
    is added to the intelligence graph together with CO_OCCURRENCE
    relationships to the original node and to all other entities derived by
    the same tool.

    Args:
        node: node to enrich; must carry "label", "nodeId" and the primary
            keys of its entity type.

    Returns:
        The list of added nodes, each annotated with its entity type under
        the "label" key.

    Raises:
        AssertionError: if the node's entity type is not covered by the
            OSINT matrix.
    """
    # check input against the configured OSINT matrix
    with open(config("OSINT_MATRIX")) as f:
        osint_matrix = json.load(f)
    osint_entities = list(osint_matrix.keys())
    entity_type = node["label"]
    assert entity_type in osint_entities, "OSINT enrichment not supported for given entity_type: %s!" % entity_type

    # init containers for everything gathered during enrichment
    nodes_enrichment = {
        "SocialMediaPost": [],
        "Username": [],
        "Person": [],
        "Location": [],
        "Text": [],
        "Media": [],
        "UserAccount": [],
        "Domain": [],
        "Keyword": [],
        "Email": [],
        "IpAddress": [],
        "Organization": [],
        "Phone": [],
        "HashValue": [],
    }
    edges_enrichment = {"CO_OCCURRENCE": []}

    # primary-key values of the entity node are the tool inputs
    pks = get_primary_keys(entity_type)
    entity_value = [node[pk] for pk in pks]

    # applicable OSINT tools, deduplicated across all enrichment categories
    # NOTE: going through a set makes the execution order nondeterministic
    # (same as before)
    entity_tools = list(
        {t for tools in osint_matrix[entity_type].values() for t in tools})

    # get OSINT data
    for t in entity_tools:
        # get data from OSINT tool
        d = tool2function[t](*entity_value)
        if not d:
            continue
        # node IDs of entities already derived by this tool
        node_ids = []
        # iterate over different entity types received from OSINT tool
        for k, v in d.items():
            # add entities
            nodes_enrichment[k].extend(v)
            # iterate over different entities
            for n in v:
                node_id = n["nodeId"]
                # add CO_OCCURRENCE relationship to original node ...
                edges_enrichment["CO_OCCURRENCE"].append({
                    "a": node["nodeId"],
                    "b": node_id
                })
                # ... and to every previously derived entity of this tool
                for i in node_ids:
                    edges_enrichment["CO_OCCURRENCE"].append({
                        "a": i,
                        "b": node_id
                    })
                node_ids.append(node_id)

    # store in intelligence graph
    load_graph_into_db(nodes=nodes_enrichment, edges=edges_enrichment)

    # annotate added nodes with their entity type and return them
    added_nodes = []
    for k, v in nodes_enrichment.items():
        for n in v:
            n["label"] = k
        added_nodes.extend(v)
    return added_nodes