示例#1
0
def _store_node(nodes, schema, properties, basic_info):
    """
    Finalize `properties` as a node of type `schema` and store it if it validates.

    The basic information and the unique node ID are added, the dictionary is passed
    through clean_dict and, if check_schema accepts it, appended to nodes[schema].

    Returns a tuple (node_id, accepted). The ID is returned for rejected nodes as well,
    because callers still use it as an edge endpoint.
    """
    properties.update(basic_info)
    node_id = create_unique_id(data=properties, schema=schema)
    properties["nodeId"] = node_id
    node = clean_dict(d=properties)
    accepted = check_schema(data=node, schema=schema)
    if accepted:
        nodes[schema].append(node)
    return node_id, accepted


def _store_linked_nodes(nodes, edges, schema, key, values, parent_id, basic_info):
    """
    Store one `schema` node per value, include it in `parent_id` and link it to its predecessors.
    """
    earlier_ids = []
    for value in values:
        node_id, accepted = _store_node(nodes, schema, {key: value}, basic_info)
        if accepted:
            edges["INCLUSION"].append({"a": parent_id, "b": node_id})
            for earlier_id in earlier_ids:
                edges["CO_OCCURRENCE"].append({"a": earlier_id, "b": node_id})
        # Registered only after linking, so a node can never co-occur with itself.
        earlier_ids.append(node_id)


def _store_image(nodes, edges, basic_info, owner_id, url, extra=None):
    """
    Download the image at `url` and store its Media and HashValue nodes below `owner_id`.
    """
    file_path = download_media(url=url)
    if file_path is None:
        return

    media = {"url": url, "type": "image", **extract_media_info(s=url)}
    if extra:
        media.update(extra)

    media_id, accepted = _store_node(nodes, "Media", media, basic_info)
    if not accepted:
        return
    edges["INCLUSION"].append({"a": owner_id, "b": media_id})

    hash_value = {"hashValue": get_checksum(file=file_path)}
    hash_id, accepted = _store_node(nodes, "HashValue", hash_value, basic_info)
    if accepted:
        edges["INCLUSION"].append({"a": media_id, "b": hash_id})


def _distinct_unlisted_domains(url_infos):
    """
    Return the distinct domains of `url_infos` that are not in ALEXA1M, in first-seen order.
    """
    # assumes the "domain" value is a hashable string -- TODO confirm against extract_urls
    domains = {}
    for url_info in url_infos:
        if not url_info:
            continue
        domain = url_info["domain"]
        # do not consider large webpages
        if domain not in ALEXA1M:
            domains[domain] = None
    return list(domains)


def _store_social_media_post(record, nodes, edges, basic_info):
    """
    Derive all nodes and edges of one collected tweet and add them to `nodes` / `edges`.
    """

    ###############################################################################################
    # SocialMediaPost

    post = {
        "platform": "twitter",
        "id": str(record["id"]),
        "url": record["link"],
        "shared": record["retweet"],
        "likesCount": record["likes_count"],
        "repliesCount": record["replies_count"],
        "sharesCount": record["retweets_count"],
        "datePublished": standardize_date_time(date_time=(int(record["created_at"] / 1000)), timestamp=True),
    }

    # Unfortunately, given the information Twint extracts, it can not be determined whether the
    # posting is a reply or the original one.
    if record["retweet"]:
        post["type"] = "share"

    post_id, accepted = _store_node(nodes, "SocialMediaPost", post, basic_info)
    if not accepted:
        # nothing else is stored for a posting that does not match the schema
        return

    ###############################################################################################
    # Username

    username_id, accepted = _store_node(nodes, "Username", {"username": record["username"]}, basic_info)
    if accepted:
        edges["INCLUSION"].append({"a": post_id, "b": username_id})

    ###############################################################################################
    # Person

    person_id, accepted = _store_node(nodes, "Person", extract_name(s=record["name"]), basic_info)
    if accepted:
        edges["INCLUSION"].append({"a": post_id, "b": person_id})
        edges["CO_OCCURRENCE"].append({"a": username_id, "b": person_id})

    ###############################################################################################
    # Location

    geo = record["geo"]
    # geo should not be NaN (NaN is the only value that is not equal to itself)
    if geo == geo:
        location = complete_location(data=geo, given="coordinates")
        # only continue if location could be retrieved (same guard as for user accounts)
        if location:
            location_id, accepted = _store_node(nodes, "Location", location, basic_info)
            if accepted:
                edges["INCLUSION"].append({"a": post_id, "b": location_id})
                edges["CO_OCCURRENCE"].append({"a": username_id, "b": location_id})
                edges["CO_OCCURRENCE"].append({"a": person_id, "b": location_id})

    ###############################################################################################
    # Media (Photos)

    photo_urls = dict.fromkeys(n.strip() for n in ast.literal_eval(record["photos"]))
    for url in photo_urls:
        _store_image(nodes, edges, basic_info, owner_id=post_id, url=url)

    ###############################################################################################
    # Text

    urls_in_text = [n.strip() for n in ast.literal_eval(record["urls"])]

    text_id, accepted = _store_node(nodes, "Text", {"text": record["tweet"]}, basic_info)
    if accepted:
        edges["INCLUSION"].append({"a": post_id, "b": text_id})

    ###############################################################################################
    # Keywords

    # NOTE(review): an earlier, immediately overwritten assignment also collected the cashtags;
    # only hashtags were ever stored, which is kept here -- confirm whether cashtags are wanted.
    hashtags = dict.fromkeys(n.strip() for n in ast.literal_eval(record["hashtags"]))
    _store_linked_nodes(nodes, edges, "Keyword", "keyword", hashtags, text_id, basic_info)

    ###############################################################################################
    # Domains

    url_infos = []
    for url in dict.fromkeys(urls_in_text):
        extracted = extract_urls(url)
        # only the first extracted entry of every URL is considered
        url_infos.append(next(iter(extracted.values()), None))

    domains = _distinct_unlisted_domains(url_infos)
    _store_linked_nodes(nodes, edges, "Domain", "domain", domains, text_id, basic_info)

    ###############################################################################################
    # Usernames (Mentions)

    mentions = dict.fromkeys(n.strip() for n in ast.literal_eval(record["mentions"]))
    _store_linked_nodes(nodes, edges, "Username", "username", mentions, text_id, basic_info)


def _store_user_account(record, nodes, edges, basic_info):
    """
    Derive all nodes and edges of one collected user account and add them to `nodes` / `edges`.
    """

    ###############################################################################################
    # UserAccount

    user_account = {
        "private": bool(record["private"]),
        "verifiedByPlatform": bool(record["verified"]),
        "followersCount": record["followers"],
        "followingCount": record["following"],
        "dateTimeJoined": standardize_date_time(
            date_time=(record["join_date"] + " " + record["join_time"]), format="%d %b %Y %I:%M %p"
        ),
        "mediaCount": record["media"],
        "postingsCount": record["tweets"],
        "platform": "twitter",
        "id": str(record["id"]),
        # NOTE(review): "https:/" has only one slash, so the stored URL is malformed. It is left
        # unchanged because the value may feed create_unique_id -- confirm before correcting.
        "url": ("https:/twitter.com/" + record["username"]),
        "likesCount": record["likes"],
    }

    account_id, accepted = _store_node(nodes, "UserAccount", user_account, basic_info)
    if not accepted:
        # nothing else is stored for an account that does not match the schema
        return

    ###############################################################################################
    # Username

    # The username has already been added to the graph during storage of collected Tweets.
    # Only its unique ID is calculated to add an INCLUSION edge to the user account.
    username_id = create_unique_id(data={"username": record["username"]}, schema="Username")
    edges["INCLUSION"].append({"a": account_id, "b": username_id})

    ###############################################################################################
    # Person

    # The person has already been added to the graph during storage of collected Tweets.
    # Only its unique ID is calculated to add an INCLUSION edge to the user account.
    person_id = create_unique_id(data=extract_name(s=record["name"]), schema="Person")
    edges["INCLUSION"].append({"a": account_id, "b": person_id})

    ###############################################################################################
    # Text and Domains (both derived from the bio)

    bio = record["bio"]
    # bio should not be NaN
    if bio == bio:
        text_id, accepted = _store_node(nodes, "Text", {"text": bio}, basic_info)
        if accepted:
            edges["INCLUSION"].append({"a": account_id, "b": text_id})

        # extract urls from bio
        extracted_urls = extract_urls(bio)
        domains = _distinct_unlisted_domains(extracted_urls.values())
        _store_linked_nodes(nodes, edges, "Domain", "domain", domains, text_id, basic_info)

    ###############################################################################################
    # Location

    location = record["location"]
    # location should not be NaN
    if location == location:
        location = complete_location(data=location, given="address")
        # only continue if location could be retrieved
        if location:
            location_id, accepted = _store_node(nodes, "Location", location, basic_info)
            if accepted:
                edges["INCLUSION"].append({"a": account_id, "b": location_id})
                edges["CO_OCCURRENCE"].append({"a": username_id, "b": location_id})
                edges["CO_OCCURRENCE"].append({"a": person_id, "b": location_id})

    ###############################################################################################
    # Media (profile image and profile background image)

    # Every URL is paired with its property explicitly; deriving the property from the position
    # in a set would label the images arbitrarily because sets are unordered.
    labelled_urls = (
        (record["profile_image_url"], "profileImage"),
        (record["background_image"], "profileBackgroundImage"),
    )
    stored_urls = set()
    for url, image_property in labelled_urls:
        # URL should not be NaN and is stored only once
        if url != url or url in stored_urls:
            continue
        stored_urls.add(url)
        _store_image(nodes, edges, basic_info, owner_id=account_id, url=url, extra={image_property: True})


def store_twitter_data(data_path: str, data_type: str = "SocialMediaPost"):
    """
    Store collected Twitter data in Neo4j database.

    data_path: path of the CSV file holding the collected data (comma separated, with header row)
    data_type: type of collected data (SocialMediaPost or UserAccount)

    Rows with a duplicated "id" are ignored (the first one is kept). All derived nodes and
    edges are collected in memory and handed to load_graph_into_db in a single call.

    Raises AssertionError if data_type is not one of the supported types.
    """

    # check input (raised explicitly so that the check also runs under "python -O")
    if data_type not in ("SocialMediaPost", "UserAccount"):
        raise AssertionError("data_type should be 'SocialMediaPost' or 'UserAccount'!")

    # init
    nodes = {
        "SocialMediaPost": [],
        "Username": [],
        "Person": [],
        "Location": [],
        "Text": [],
        "Media": [],
        "UserAccount": [],
        "Domain": [],
        "Keyword": [],
        "HashValue": [],
    }
    edges = {"INCLUSION": [], "CO_OCCURRENCE": []}

    # read data and drop duplicates
    data = pd.read_csv(data_path, sep=",", header=0)
    data = data.drop_duplicates(subset=["id"], keep="first")

    # basic information of all nodes/relationships
    basic_info = {"timestamp": get_standardized_now(), "schemaVersion": 0.1}

    # extract nodes/links for every entry
    records = data.to_dict(orient="records")

    if data_type == "SocialMediaPost":
        for record in tqdm(records, desc="SocialMediaPosts", total=len(records), unit="records"):
            _store_social_media_post(record, nodes, edges, basic_info)
    else:
        for record in tqdm(records, desc="UserAccounts", total=len(records), unit="records"):
            _store_user_account(record, nodes, edges, basic_info)

    # load data into database
    load_graph_into_db(nodes=nodes, edges=edges)
示例#2
0
                        edges_ner["INCLUSION"].append({
                            "a": t_node_id,
                            "b": node_id
                        })

                        # add CO_OCCURRENCE relationship to other derived entities
                        for i in node_ids[:n_i]:
                            edges_ner["CO_OCCURRENCE"].append({
                                "a": i,
                                "b": node_id
                            })

                        n_i += 1

        # store in intelligence graph
        load_graph_into_db(nodes=nodes_ner, edges=edges_ner)

    ###############################################################################################
    # Betweenness centrality

    if calculate_centrality:
        # calculate betweenness centrality and store as node property
        betweenness_centrality()

        # get top betweenness centralities for usernames and keywords
        with neo4j.session() as session:
            # username
            result_usernames = session.run(
                "MATCH (n:Username) RETURN n.betweennessCentrality AS betweennessCentrality, n.username AS username ORDER BY n.betweennessCentrality DESC LIMIT 10"
            )
            usernames_top_10 = [[r["username"], r["betweennessCentrality"]]
示例#3
0
def load_in_db():
    """
    Load collected information into database according
    AMONet model.
    """

    # get data
    # client and database to connect to MongoDB
    mongo_client, mongo_db = mongodb_connect()

    # get account related information
    related_information = list(mongo_db["osint_vk_accounts"].find(
        {
            "twitter_profile": {
                "$exists": "true"
            },
        },
        {
            "twitter_profile": 1,
            "screen_name": 1,
            "city": 1,
            "country": 1,
            "first_name": 1,
            "last_name": 1,
            "university_name": 1,
            "home_phone": 1,
            "instagram": 1,
            "site": 1,
            "skype": 1,
            "twitter": 1,
            "facebook": 1,
            "_id": 0,
        },
    ))

    # basic information of all nodes/relationships
    basic_info = {"timestamp": get_standardized_now(), "schemaVersion": 0.1}

    for x in tqdm(related_information,
                  desc="accounts",
                  total=len(related_information),
                  unit="records"):

        # store data in network
        # init
        nodes = {
            "Username": [],
            "UserAccount": [],
            "Location": [],
            "Person": [],
            "Organization": [],
            "Phone": [],
            "Domain": [],
        }
        edges = {"INCLUSION": [], "CO_OCCURRENCE": []}

        #######################################################################
        # VK account/username

        # information given in data
        vk_user_account = {
            "platform": "vk",
            "id": get_vk_id(username=x["screen_name"]),
            "url": ("https:/vk.com/" + x["screen_name"]),
        }
        time.sleep(0.3)

        # add basic information
        vk_user_account.update(basic_info)

        # add unique id
        vk_user_account_unique_id = create_unique_id(data=vk_user_account,
                                                     schema="UserAccount")
        vk_user_account["nodeId"] = vk_user_account_unique_id

        # remove possible None and empty values
        vk_user_account = clean_dict(d=vk_user_account)

        # check schema
        if check_schema(data=vk_user_account, schema="UserAccount"):
            # add node to graph
            nodes["UserAccount"].append(vk_user_account)

            # add username
            vk_username = {"username": x["screen_name"]}
            vk_username.update(basic_info)
            vk_username_unique_id = ""  # reset
            vk_username_unique_id = create_unique_id(data=vk_username,
                                                     schema="Username")
            vk_username["nodeId"] = vk_username_unique_id

            if check_schema(data=vk_username, schema="Username"):
                # add node to graph
                nodes["Username"].append(vk_username)
                # add INCLUSION relationship
                edges["INCLUSION"].append({
                    "a": vk_user_account_unique_id,
                    "b": vk_username_unique_id
                })
            else:
                vk_username_unique_id = ""  # reset
        else:
            vk_user_account_unique_id = ""  # reset
            continue

        #######################################################################
        # VK location

        try:
            # information given in data
            vk_location = complete_location(data=x["city"]["title"] + ", " +
                                            x["country"]["title"],
                                            given="address")
            time.sleep(0.3)

            # add basic information
            vk_location.update(basic_info)

            # add unique id
            vk_location_unique_id = create_unique_id(data=vk_location,
                                                     schema="Location")
            vk_location["nodeId"] = vk_location_unique_id

            # remove possible None and empty values
            vk_location = clean_dict(d=vk_location)

            # check schema
            if check_schema(data=vk_location, schema="Location"):
                # add node to graph
                nodes["Location"].append(vk_location)

                # add INCLUSION relationship
                edges["INCLUSION"].append({
                    "a": vk_user_account_unique_id,
                    "b": vk_location_unique_id
                })

                # add CO_OCCURRENCE relationship
                if vk_username_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a": vk_username_unique_id,
                        "b": vk_location_unique_id
                    })
        except Exception as e:
            # reset
            vk_location_unique_id = ""

        #######################################################################
        # VK person

        try:
            # information given in data
            vk_person = extract_name(s=x["first_name"] + " " + x["last_name"])

            # add basic information
            vk_person.update(basic_info)

            # add unique id
            vk_person_unique_id = create_unique_id(data=vk_person,
                                                   schema="Person")
            vk_person["nodeId"] = vk_person_unique_id

            # remove possible None and empty values
            vk_person = clean_dict(d=vk_person)

            # check schema
            if check_schema(data=vk_person, schema="Person"):
                # add node to graph
                nodes["Person"].append(vk_person)

                # add INCLUSION relationship
                edges["INCLUSION"].append({
                    "a": vk_user_account_unique_id,
                    "b": vk_person_unique_id
                })

                # add CO_OCCURRENCE relationships
                if vk_username_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a": vk_username_unique_id,
                        "b": vk_person_unique_id
                    })
                if vk_location_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a": vk_location_unique_id,
                        "b": vk_person_unique_id
                    })
        except KeyError:
            # reset
            vk_person_unique_id = ""

        #######################################################################
        # VK organization

        try:
            # information given in data
            vk_organization = {
                "name": x["university_name"],
                "rawData": x["university_name"],
                "description": "university",
            }

            # add basic information
            vk_organization.update(basic_info)

            # add unique id
            vk_organization_unique_id = create_unique_id(data=vk_organization,
                                                         schema="Organization")
            vk_organization["nodeId"] = vk_organization_unique_id

            # remove possible None and empty values
            vk_organization = clean_dict(d=vk_organization)

            # check schema
            if check_schema(data=vk_organization, schema="Organization"):
                # add node to graph
                nodes["Organization"].append(vk_organization)

                # add INCLUSION relationship
                edges["INCLUSION"].append({
                    "a": vk_user_account_unique_id,
                    "b": vk_organization_unique_id
                })

                # add CO_OCCURRENCE relationships
                if vk_username_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        vk_username_unique_id,
                        "b":
                        vk_organization_unique_id
                    })
                if vk_location_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        vk_location_unique_id,
                        "b":
                        vk_organization_unique_id
                    })
                if vk_person_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        vk_person_unique_id,
                        "b":
                        vk_organization_unique_id
                    })
        except KeyError:
            # reset
            vk_organization_unique_id = ""

        #######################################################################
        # VK phone

        try:
            # check if valid phone number
            phone = phonenumbers.parse(x["home_phone"], None)

            if phonenumbers.is_valid_number(phone):
                # information given in data
                vk_phone = {
                    "phone":
                    phonenumbers.format_number(
                        phone, phonenumbers.PhoneNumberFormat.E164)
                }

                # add basic information
                vk_phone.update(basic_info)

                # add unique id
                vk_phone_unique_id = create_unique_id(data=vk_phone,
                                                      schema="Phone")
                vk_phone["nodeId"] = vk_phone_unique_id

                # remove possible None and empty values
                vk_phone = clean_dict(d=vk_phone)

                # check schema
                if check_schema(data=vk_phone, schema="Phone"):
                    # add node to graph
                    nodes["Phone"].append(vk_phone)

                    # add INCLUSION relationship
                    edges["INCLUSION"].append({
                        "a": vk_user_account_unique_id,
                        "b": vk_phone_unique_id
                    })

                    # add CO_OCCURRENCE relationships
                    if vk_username_unique_id:
                        edges["CO_OCCURRENCE"].append({
                            "a": vk_username_unique_id,
                            "b": vk_phone_unique_id
                        })
                    if vk_location_unique_id:
                        edges["CO_OCCURRENCE"].append({
                            "a": vk_location_unique_id,
                            "b": vk_phone_unique_id
                        })
                    if vk_person_unique_id:
                        edges["CO_OCCURRENCE"].append({
                            "a": vk_person_unique_id,
                            "b": vk_phone_unique_id
                        })
                    if vk_organization_unique_id:
                        edges["CO_OCCURRENCE"].append({
                            "a": vk_organization_unique_id,
                            "b": vk_phone_unique_id
                        })
        except Exception as e:
            # reset
            vk_phone_unique_id = ""

        #######################################################################
        # VK domains

        if x["site"]:
            # information given in data
            extracted_domains = extract_urls(s=x["site"])

            if len(extracted_domains.items()) > 0:
                domains_unique_ids = []

                for d in extracted_domains.values():
                    vk_domain = {"domain": d["domain"]}

                    # add basic information
                    vk_domain.update(basic_info)

                    # add unique id
                    vk_domain_unique_id = create_unique_id(data=vk_domain,
                                                           schema="Domain")
                    vk_domain["nodeId"] = vk_domain_unique_id

                    # remove possible None and empty values
                    vk_domain = clean_dict(d=vk_domain)

                    # check schema
                    if check_schema(data=vk_domain, schema="Domain"):
                        # add node to graph
                        nodes["Domain"].append(vk_domain)

                        # add INCLUSION relationship
                        edges["INCLUSION"].append({
                            "a": vk_user_account_unique_id,
                            "b": vk_domain_unique_id
                        })

                        # add CO_OCCURRENCE relationships
                        if vk_username_unique_id:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                vk_username_unique_id,
                                "b":
                                vk_domain_unique_id
                            })
                        if vk_location_unique_id:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                vk_location_unique_id,
                                "b":
                                vk_domain_unique_id
                            })
                        if vk_person_unique_id:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                vk_person_unique_id,
                                "b":
                                vk_domain_unique_id
                            })
                        if vk_organization_unique_id:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                vk_organization_unique_id,
                                "b":
                                vk_domain_unique_id
                            })
                        if vk_phone_unique_id:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                vk_phone_unique_id,
                                "b":
                                vk_domain_unique_id
                            })
                        for i in domains_unique_ids:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                i,
                                "b":
                                vk_domain_unique_id
                            })

                        domains_unique_ids.append(vk_domain_unique_id)
            else:
                # reset
                domains_unique_ids = []
        else:
            # reset
            domains_unique_ids = []

        #######################################################################
        # VK additional usernames

        additional_usernames = [x["twitter"]]
        additional_usernames_unique_ids = []

        try:
            additional_usernames.append(x["facebook"])
        except KeyError:
            pass
        try:
            additional_usernames.append(x["instagram"])
        except KeyError:
            pass
        try:
            additional_usernames.append(x["skype"])
        except KeyError:
            pass

        # add additional usernames as nodes
        for u in additional_usernames:
            # information given in data
            vk_additional_username = {"username": u}

            # add basic information
            vk_additional_username.update(basic_info)

            # add unique id
            vk_additional_username_unique_id = create_unique_id(
                data=vk_additional_username, schema="Username")
            vk_additional_username["nodeId"] = vk_additional_username_unique_id

            # remove possible None and empty values
            vk_additional_username = clean_dict(d=vk_additional_username)

            # check schema
            if check_schema(data=vk_additional_username, schema="Username"):
                # add node to graph
                nodes["Username"].append(vk_additional_username)

                # add INCLUSION relationship
                edges["INCLUSION"].append({
                    "a": vk_user_account_unique_id,
                    "b": vk_additional_username_unique_id
                })

                # add CO_OCCURRENCE relationships
                if vk_username_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        vk_username_unique_id,
                        "b":
                        vk_additional_username_unique_id
                    })
                if vk_location_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        vk_location_unique_id,
                        "b":
                        vk_additional_username_unique_id
                    })
                if vk_person_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        vk_person_unique_id,
                        "b":
                        vk_additional_username_unique_id
                    })
                if vk_organization_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        vk_organization_unique_id,
                        "b":
                        vk_additional_username_unique_id
                    })
                if vk_phone_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        vk_phone_unique_id,
                        "b":
                        vk_additional_username_unique_id
                    })
                for i in domains_unique_ids:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        i,
                        "b":
                        vk_additional_username_unique_id
                    })
                for i in additional_usernames_unique_ids:
                    edges["CO_OCCURRENCE"].append({
                        "a":
                        i,
                        "b":
                        vk_additional_username_unique_id
                    })

                additional_usernames_unique_ids.append(
                    vk_additional_username_unique_id)

        #######################################################################
        # Twitter account/username

        # information given in data
        twitter_user_account = {
            "platform": "twitter",
            "id": x["twitter_profile"]["id_str"],
            "url": ("https:/twitter.com/" + x["twitter_profile"]["id_str"]),
        }

        # add basic information
        twitter_user_account.update(basic_info)

        # add unique id
        twitter_user_account_unique_id = create_unique_id(
            data=twitter_user_account, schema="UserAccount")
        twitter_user_account["nodeId"] = twitter_user_account_unique_id

        # remove possible None and empty values
        twitter_user_account = clean_dict(d=twitter_user_account)

        # check schema
        if check_schema(data=twitter_user_account, schema="UserAccount"):
            # add node to graph
            nodes["UserAccount"].append(twitter_user_account)

            # add username
            twitter_username = {
                "username": x["twitter_profile"]["screen_name"]
            }
            twitter_username.update(basic_info)
            twitter_username_unique_id = ""  # reset
            twitter_username_unique_id = create_unique_id(
                data=twitter_username, schema="Username")
            twitter_username["nodeId"] = twitter_username_unique_id

            if check_schema(data=twitter_username, schema="Username"):
                # add node to graph
                nodes["Username"].append(twitter_username)
                # add INCLUSION relationships
                edges["INCLUSION"].append({
                    "a": twitter_user_account_unique_id,
                    "b": twitter_username_unique_id
                })
            else:
                twitter_username_unique_id = ""  # reset
        else:
            twitter_user_account_unique_id = ""  # reset
            continue

        #######################################################################
        # Twitter location

        if x["twitter_profile"]["location"]:
            # information given in data
            twitter_location = complete_location(
                data=x["twitter_profile"]["location"], given="address")
            time.sleep(0.3)

            if twitter_location:
                # add basic information
                twitter_location.update(basic_info)

                # add unique id
                twitter_location_unique_id = create_unique_id(
                    data=twitter_location, schema="Location")
                twitter_location["nodeId"] = twitter_location_unique_id

                # remove possible None and empty values
                twitter_location = clean_dict(d=twitter_location)

                # check schema
                if check_schema(data=twitter_location, schema="Location"):
                    # add node to graph
                    nodes["Location"].append(twitter_location)

                    # add INCLUSION relationship
                    edges["INCLUSION"].append({
                        "a": twitter_user_account_unique_id,
                        "b": twitter_location_unique_id
                    })

                    # add CO_OCCURRENCE relationship
                    if twitter_username_unique_id:
                        edges["CO_OCCURRENCE"].append({
                            "a":
                            twitter_username_unique_id,
                            "b":
                            twitter_location_unique_id
                        })
            else:
                # reset
                twitter_location_unique_id = ""
        else:
            # reset
            twitter_location_unique_id = ""

        #######################################################################
        # Twitter person

        if x["twitter_profile"]["name"]:
            # information given in data
            twitter_person = extract_name(s=x["twitter_profile"]["name"])

            # add basic information
            twitter_person.update(basic_info)

            # add unique id
            twitter_person_unique_id = create_unique_id(data=twitter_person,
                                                        schema="Person")
            twitter_person["nodeId"] = twitter_person_unique_id

            # remove possible None and empty values
            twitter_person = clean_dict(d=twitter_person)

            # check schema
            if check_schema(data=twitter_person, schema="Person"):
                # add node to graph
                nodes["Person"].append(twitter_person)

                # add INCLUSION relationship
                edges["INCLUSION"].append({
                    "a": twitter_user_account_unique_id,
                    "b": twitter_person_unique_id
                })

                # add CO_OCCURRENCE relationships
                if twitter_username_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a": twitter_username_unique_id,
                        "b": twitter_person_unique_id
                    })
                if twitter_location_unique_id:
                    edges["CO_OCCURRENCE"].append({
                        "a": twitter_location_unique_id,
                        "b": twitter_person_unique_id
                    })
        else:
            # reset
            twitter_person_unique_id = ""

        #######################################################################
        # Twitter domains

        if x["twitter_profile"]["url"]:
            # information given in data
            extracted_domains = extract_urls(s=x["twitter_profile"]["url"])

            if len(extracted_domains.items()) > 0:
                domains_unique_ids = []

                for d in extracted_domains.values():
                    twitter_domain = {"domain": d["domain"]}

                    # add basic information
                    twitter_domain.update(basic_info)

                    # add unique id
                    twitter_domain_unique_id = create_unique_id(
                        data=twitter_domain, schema="Domain")
                    twitter_domain["nodeId"] = twitter_domain_unique_id

                    # remove possible None and empty values
                    twitter_domain = clean_dict(d=twitter_domain)

                    # check schema
                    if check_schema(data=twitter_domain, schema="Domain"):
                        # add node to graph
                        nodes["Domain"].append(twitter_domain)

                        # add INCLUSION relationship
                        edges["INCLUSION"].append({
                            "a": twitter_user_account_unique_id,
                            "b": twitter_domain_unique_id
                        })

                        # add CO_OCCURRENCE relationships
                        if twitter_username_unique_id:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                twitter_username_unique_id,
                                "b":
                                twitter_domain_unique_id
                            })
                        if twitter_location_unique_id:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                twitter_location_unique_id,
                                "b":
                                twitter_domain_unique_id
                            })
                        if twitter_person_unique_id:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                twitter_person_unique_id,
                                "b":
                                twitter_domain_unique_id
                            })
                        for i in domains_unique_ids:
                            edges["CO_OCCURRENCE"].append({
                                "a":
                                i,
                                "b":
                                twitter_domain_unique_id
                            })

                        domains_unique_ids.append(twitter_domain_unique_id)
            else:
                # reset
                domains_unique_ids = []
        else:
            # reset
            domains_unique_ids = []

        # load data into database
        load_graph_into_db(nodes=nodes, edges=edges)
示例#4
0
def _non_empty_str(value) -> str:
    """Return ``value`` if it is a non-empty string, otherwise ``""``.

    Cells of the sample dataframe hold NaN (a float) when the MongoDB
    document had no such field, so a plain truthiness test is not enough.
    """
    return value if isinstance(value, str) and value else ""


def _add_linked_account_nodes(nodes: dict, edges: dict, basic_info: dict,
                              platform: str, account_id, url: str,
                              username: str) -> tuple:
    """Add the UserAccount node of one platform and its Username node.

    The account node is built from ``platform``/``account_id``/``url`` plus
    ``basic_info``; the username node from ``username`` plus ``basic_info``.
    Each node is appended to ``nodes`` only if it passes ``check_schema``.
    The username node is only attempted when the account node was accepted,
    and an INCLUSION edge account -> username is added when both exist.

    Returns:
        ``(account_unique_id, username_unique_id)``; an element is ``""``
        when the corresponding node was not added to the graph.
    """
    user_account = {"platform": platform, "id": account_id, "url": url}
    user_account.update(basic_info)
    account_unique_id = create_unique_id(data=user_account,
                                         schema="UserAccount")
    user_account["nodeId"] = account_unique_id

    # remove possible None and empty values
    user_account = clean_dict(d=user_account)

    if not check_schema(data=user_account, schema="UserAccount"):
        return "", ""
    nodes["UserAccount"].append(user_account)

    username_node = {"username": username}
    username_node.update(basic_info)
    username_unique_id = create_unique_id(data=username_node,
                                          schema="Username")
    username_node["nodeId"] = username_unique_id

    if not check_schema(data=username_node, schema="Username"):
        return account_unique_id, ""
    nodes["Username"].append(username_node)
    edges["INCLUSION"].append({
        "a": account_unique_id,
        "b": username_unique_id
    })
    return account_unique_id, username_unique_id


def _link_username_to_profile(edges: dict, vk_account_unique_id: str,
                              username_unique_id: str,
                              known_usernames: list) -> None:
    """Connect a newly stored username with the rest of the current row.

    Adds an INCLUSION edge from the VK account (when there is one) and one
    CO_OCCURRENCE edge from every username already stored for this row,
    then records ``username_unique_id`` in ``known_usernames``.
    Does nothing when ``username_unique_id`` is empty, so no edge ever
    refers to a node that was not added to the graph.
    """
    if not username_unique_id:
        return
    if vk_account_unique_id:
        edges["INCLUSION"].append({
            "a": vk_account_unique_id,
            "b": username_unique_id
        })
    for earlier_unique_id in known_usernames:
        edges["CO_OCCURRENCE"].append({
            "a": earlier_unique_id,
            "b": username_unique_id
        })
    known_usernames.append(username_unique_id)


def cross_network_user_sample(skip: int = 40, limit: int = 10):
    """
    Extract sample of linked accounts of VK users and process to be used
    in AMONet framework.

    Reads documents from the "osint_vk_accounts" collection that have at
    least one of the fields "instagram", "facebook" or "twitter", builds
    UserAccount/Username nodes for VK and every linked platform, connects
    them with INCLUSION and CO_OCCURRENCE edges and loads the result into
    the graph database.

    skip: number of matching documents to skip (default 40, as before)
    limit: maximum number of documents to process (default 10, as before)
    """

    # get data
    # client and database to connect to MongoDB
    mongo_client, mongo_db = mongodb_connect()

    # get account related information
    linked_platforms = ("instagram", "facebook", "twitter")
    related_information = list(mongo_db["osint_vk_accounts"].find(
        # any one linked account is enough; $exists takes a boolean
        {"$or": [{p: {"$exists": True}} for p in linked_platforms]},
        {
            "instagram": 1,
            "facebook": 1,
            "twitter": 1,
            "screen_name": 1,
            "_id": 0,
        },
        skip=skip,
        limit=limit,
    ))

    # create dataframe from information
    df = pd.DataFrame(related_information)
    df = df.rename(columns={"screen_name": "vk", "site": "website"})

    # drop users with no connected account, even no vk screen name
    df.dropna(how="all", inplace=True)

    # store data in network
    # init
    nodes = {
        "Username": [],
        "UserAccount": [],
    }
    edges = {"INCLUSION": [], "CO_OCCURRENCE": []}

    # basic information of all nodes/relationships
    basic_info = {"timestamp": get_standardized_now(), "schemaVersion": 0.1}

    # NOTE(review): every URL below starts with "https:/" (single slash).
    # It is kept unchanged because other functions in this file build
    # account URLs the same way; confirm before normalising to "https://".

    for _, x in tqdm(df.iterrows(),
                     desc="VK accounts",
                     total=len(df),
                     unit="records"):

        # Per-row state, rebuilt for every record so that nothing leaks
        # over from the previous row.
        vk_user_account_unique_id = ""
        known_usernames = []  # ids of usernames stored for this row

        #######################################################################
        # VK

        # x.get(): the column is absent if no sampled document has the field
        vk_name = _non_empty_str(x.get("vk"))
        if vk_name:
            vk_id = get_vk_id(username=vk_name)
            # pause after the lookup (presumably rate limiting - confirm)
            time.sleep(0.3)

            vk_user_account_unique_id, vk_username_unique_id = \
                _add_linked_account_nodes(
                    nodes, edges, basic_info,
                    platform="vk",
                    account_id=vk_id,
                    url=("https:/vk.com/" + vk_name),
                    username=vk_name)

            if not vk_user_account_unique_id:
                # invalid VK account: skip the whole record
                continue
            if vk_username_unique_id:
                known_usernames.append(vk_username_unique_id)

        #######################################################################
        # Twitter

        twitter_name = _non_empty_str(x.get("twitter"))
        if twitter_name:
            try:
                twitter_id = get_twitter_user(username=twitter_name).id_str
            except Exception:
                # Best effort: the lookup helper's failure modes are not
                # visible here, so any failure just skips the Twitter
                # account of this record.
                pass
            else:
                _, twitter_username_unique_id = _add_linked_account_nodes(
                    nodes, edges, basic_info,
                    platform="twitter",
                    account_id=twitter_id,
                    url=("https:/twitter.com/" + twitter_name),
                    username=twitter_name)
                _link_username_to_profile(edges, vk_user_account_unique_id,
                                          twitter_username_unique_id,
                                          known_usernames)

        #######################################################################
        # Facebook

        facebook_name = _non_empty_str(x.get("facebook"))
        if facebook_name:
            # get Facebook ID
            facebook_id = extract_facebook_id(s=facebook_name)

            if facebook_id:
                # the extracted id doubles as the username
                _, facebook_username_unique_id = _add_linked_account_nodes(
                    nodes, edges, basic_info,
                    platform="facebook",
                    account_id=facebook_id,
                    url=("https:/facebook.com/" + facebook_id),
                    username=facebook_id)
                _link_username_to_profile(edges, vk_user_account_unique_id,
                                          facebook_username_unique_id,
                                          known_usernames)

        #######################################################################
        # Instagram

        instagram_name = _non_empty_str(x.get("instagram"))
        if instagram_name:
            # get Instagram ID
            instagram_id = get_instagram_id(username=instagram_name)

            if instagram_id:
                # URL is built from the id, the username node from the name
                _, instagram_username_unique_id = _add_linked_account_nodes(
                    nodes, edges, basic_info,
                    platform="instagram",
                    account_id=instagram_id,
                    url=("https:/instagram.com/" + instagram_id),
                    username=instagram_name)
                _link_username_to_profile(edges, vk_user_account_unique_id,
                                          instagram_username_unique_id,
                                          known_usernames)

    # load data into database
    load_graph_into_db(nodes=nodes, edges=edges)
示例#5
0
def enrich_node(node: dict) -> List[Dict]:
    """
    Enrich a graph node with entities collected by OSINT tools.

    The OSINT matrix (JSON file located via config("OSINT_MATRIX")) lists, under
    the node's entity type, the names of the tools to run. Every tool is called
    with the node's primary key values. Each entity a tool returns is linked by a
    CO_OCCURRENCE relationship to the given node and to the entities the same
    tool call returned before it. Everything collected is loaded into the database.

    node: node to enrich; "label" and the primary key fields are always read,
          "nodeId" is read as soon as a tool returns an entity
    return: all derived nodes, each one extended by its "label"
    """

    # check input: the entity type has to be listed in the OSINT matrix
    with open(config("OSINT_MATRIX")) as matrix_file:
        osint_matrix = json.load(matrix_file)
    supported_types = list(osint_matrix.keys())

    entity_type = node["label"]

    assert entity_type in supported_types, "OSINT enrichment not supported for given entity_type: %s!" % entity_type

    # init: one empty bucket per node label a tool may deliver
    labels = (
        "SocialMediaPost",
        "Username",
        "Person",
        "Location",
        "Text",
        "Media",
        "UserAccount",
        "Domain",
        "Keyword",
        "Email",
        "IpAddress",
        "Organization",
        "Phone",
        "HashValue",
    )
    nodes_enrichment = {label: [] for label in labels}
    edges_enrichment = {"CO_OCCURRENCE": []}
    co_occurrences = edges_enrichment["CO_OCCURRENCE"]

    # primary key values of the node are the arguments handed to every tool
    tool_args = [node[key] for key in get_primary_keys(entity_type)]

    # distinct tool names over all entries of the matrix for this entity type
    tool_names = {
        name
        for names in osint_matrix[entity_type].values()
        for name in names
    }

    for tool_name in tool_names:
        result = tool2function[tool_name](*tool_args)
        if not result:
            # tool delivered nothing
            continue

        # node IDs derived by this tool call so far
        seen_ids = []

        # result maps node label -> list of derived entities
        for label, entities in result.items():
            nodes_enrichment[label].extend(entities)

            for entity in entities:
                entity_id = entity["nodeId"]

                # relationship between the original node and the derived entity
                co_occurrences.append({"a": node["nodeId"], "b": entity_id})

                # relationships between earlier derived entities and this one
                co_occurrences.extend(
                    {"a": earlier_id, "b": entity_id} for earlier_id in seen_ids
                )

                seen_ids.append(entity_id)

    # store in intelligence graph
    load_graph_into_db(nodes=nodes_enrichment, edges=edges_enrichment)

    # tag every derived node with its label and collect them in one flat list
    added_nodes = []
    for label, entities in nodes_enrichment.items():
        for entity in entities:
            entity.update({"label": label})
        added_nodes.extend(entities)

    return added_nodes