Example #1
0
def insert_relationships_into_graph(graph: Graph,
                                    serialized_groups: dict,
                                    batch_size: int = DEFAULT_BATCH_SIZE):
    """
    Build MEMBER_OF relationships from every group's participant list and
    bulk-insert them into the neo4j database via py2neo's bulk helpers.
    :param graph: represents connection to neo4j backend
    :param serialized_groups: scraper's output
    :param batch_size: batch size to upload items in bulk to DB
    """
    # One (start_key, properties, end_key) triple per participant/group pair;
    # relationships carry no properties of their own, hence the empty dict.
    member_of_rows = [
        (phone, {}, gid)
        for gid, props in serialized_groups.items()
        if 'participants' in props
        for phone in props['participants']
    ]

    # Bulk insert all relationships into DB, one batch at a time.
    print(f"Inserting {len(member_of_rows)} relationships into DB...")
    batches = list(divide_into_batches(member_of_rows, batch_size=batch_size))
    for chunk in tqdm(batches):
        create_relationships(graph.auto(),
                             data=chunk,
                             rel_type="MEMBER_OF",
                             start_node_key=("Participant", "phone_number"),
                             end_node_key=("Group", "group_id"))
Example #2
0
File: do_neo.py  Project: 8lurry/neo
class DoNeo(object):
    """Load a JSON export of nodes/relationships and push it into neo4j.

    Each record in the JSON file is a dict carrying 'Kind' ('node' or
    anything else = relationship), a 'Property' map, an 'IdUnique'
    identifier, and a 'DeDuplication' hint that selects create vs. merge.
    """

    def __init__(self, exec_default=False):
        # When exec_default is set, run the full pipeline using the
        # module-level `url` and `file` values.
        # NOTE(review): `url` and `file` are not defined in this class —
        # presumably module globals; verify they exist before using this path.
        if exec_default:
            self.connect_db(url)
            self.load_data(file)
            self.populate_db()

    def connect_db(self, url=None):
        """Open a py2neo Graph connection; fall back to the driver default."""
        self.g = Graph(url) if url else Graph()

    def load_data(self, file):
        """Read the JSON export at `file` into self.data."""
        with open(file) as f:
            self.data = json.load(f)

    def populate_db(self):
        """Insert every loaded record, dispatching on its 'Kind'."""
        for d in self.data:
            # Mirror the record id into the property map so it is stored
            # on the node/relationship itself (mutates the loaded record).
            d['Property']['IdUnique'] = d['IdUnique']
            if d['Kind'] == 'node':
                self._insert_node(d)
            else:
                self._insert_relationship(d)

    def _insert_node(self, d):
        """Create the node, or merge when it may already exist."""
        # `is None` (identity) replaces the original `== None`; None is a
        # singleton, so this is the PEP 8 form with identical results here.
        if d['DeDuplication'] is None and self.g.nodes.match(
                IdUnique=d['IdUnique']).count() == 0:
            create_nodes(self.g.auto(), [d['Property']],
                         labels=set(d['Label']))
        else:
            # Merge key: all labels plus the unique id property.
            merge_nodes(self.g.auto(), [d['Property']],
                        (tuple(d['Label']), 'IdUnique'),
                        labels=set(d['Label']))

    def _insert_relationship(self, d):
        """Create the relationship, or merge when it may already exist."""
        # py2neo bulk triple: (start_key_value, properties, end_key_value).
        da = (d['FromIdMaster'], d['Property'], d['ToIdMaster'])
        if d['DeDuplication'] is None and self.g.relationships.match(
                IdUnique=d['IdUnique']).count() == 0:
            create_relationships(self.g.auto(), [da],
                                 d['Type'],
                                 start_node_key=(d['FromLabel'], 'IdMaster'),
                                 end_node_key=(d['ToLabel'], 'IdMaster'))
        else:
            merge_relationships(self.g.auto(), [da],
                                (d['Type'], 'IdUnique'),
                                start_node_key=(d['FromLabel'], 'IdMaster'),
                                end_node_key=(d['ToLabel'], 'IdMaster'))
Example #3
0
def insert_nodes_into_graph(graph: Graph,
                            serialized_groups: dict,
                            contacts: Optional[Dict[str, str]],
                            batch_size: int = DEFAULT_BATCH_SIZE):
    """
    Create and insert nodes (Participant/Group) into the neo4j database using py2neo bulk functions
    :param graph: represents connection to neo4j backend
    :param serialized_groups: scraper's output
    :param contacts: a dictionary converting from phone number to name
    :param batch_size: batch size to upload items in bulk to DB
    """
    # Plain dict keyed by phone number deduplicates participants across
    # groups; the original nested defaultdict was never needed because
    # entries are only ever created by explicit assignment below.
    participants: Dict[str, dict] = {}
    groups = []

    for group_id, group_properties in serialized_groups.items():
        if 'participants' not in group_properties:
            continue  # skip groups the scraper returned without members
        group_name = group_properties.get('group_name', None)

        for participant_number in group_properties['participants']:
            if participant_number not in participants:
                # Prepare participant data for bulk insertion
                node = {'phone_number': participant_number}
                if contacts is not None:
                    node['name'] = contacts.get(participant_number, None)
                participants[participant_number] = node
        # Prepare group data for bulk insertion
        groups.append(dict(name=group_name, group_id=group_id))

    # Create participants nodes
    participants_list = list(participants.values())
    print(f"Inserting {len(participants_list)} participants into DB...")
    for batch_participants in tqdm(
            list(
                divide_into_batches(participants_list,
                                    batch_size=batch_size))):
        create_nodes(graph.auto(), batch_participants, labels={"Participant"})

    # Create groups nodes
    print(f"Inserting {len(groups)} groups into DB...")
    for batch_groups in tqdm(
            list(divide_into_batches(groups, batch_size=batch_size))):
        create_nodes(graph.auto(), batch_groups, labels={"Group"})
Example #4
0
def load(
    url: str = "http://download.geofabrik.de/europe/monaco-latest.osm.pbf",
    node_batch: int = 25000,
    relation_batch: int = 100,
    clear: bool = False,
):
    """load osm file into neo4j database

    Args:
        url (str, optional): location of osm file, could be in the cloud.
        Defaults to "http://download.geofabrik.de/europe/monaco-latest.osm.pbf".
        node_batch (int, optional): size of batches for nodes. Defaults to 25000.
        relation_batch (int, optional): size of batches for relations. Defaults to 100.
        clear (bool, optional): should database be cleared first. Defaults to False.
    """
    start_time = time()
    # Connection details come from the environment; getenv returns None when
    # unset, so CON_STRING/CON_USER/CON_PASS must be exported beforehand.
    db = Graph(getenv("CON_STRING"),
               auth=(getenv("CON_USER"), getenv("CON_PASS")))
    print("connected to db")

    if clear:
        clear_db(db)

    # Parse the OSM file (osmium-style handler chain); locations=True is
    # needed so way nodes carry coordinates.
    neo_handler = NeoHandler()
    osm_handler = FileHandler(neo_handler)
    osm_handler.apply_file(url, locations=True)
    num_nodes = len(neo_handler.nodes)
    num_edges = len(neo_handler.edges)
    print(f"read file {url}, nodes: {num_nodes}, relations: {num_edges}")

    # Lazily map parsed nodes to py2neo property dicts; batchify consumes
    # the generator so nothing is materialized up front.
    nodes = map(
        lambda n: {
            "node_id": n.node_id,
            "lat": n.lat,
            "long": n.long
        },
        neo_handler.nodes,
    )

    print(f"starting nodes with batch size {node_batch}")
    for batch in tqdm(
            batchify(nodes, batch_size=node_batch),
            total=ceil(num_nodes / node_batch),
    ):
        create_nodes(db.auto(), batch, labels={"Node"})

    # (start_id, properties, end_id) triples for py2neo bulk relationships.
    edges = map(
        lambda e: (
            e.start_node_id,
            {
                "distance": e.distance,
                "rating": e.rating,
                "cost": e.cost
            },
            e.end_node_id,
        ),
        neo_handler.edges,
    )

    print(f"starting relations with batch size {relation_batch}")
    for batch in tqdm(
            batchify(edges, batch_size=relation_batch),
            total=ceil(num_edges / relation_batch),
    ):
        # merge_key is just the relationship type; the original wrapped it
        # in redundant parentheses, ("Route") == "Route".
        merge_relationships(
            db.auto(),
            batch,
            merge_key="Route",
            start_node_key=("Node", "node_id"),
            end_node_key=("Node", "node_id"),
        )

    print("creating gds graph")
    # NOTE(review): gds.graph.create is the legacy name (gds.graph.project
    # in GDS 2.x) — confirm against the server's GDS version.
    db.run(
        "CALL gds.graph.create( 'nodesGraph', 'Node', 'Route', { relationshipProperties: ['rating', 'cost', 'distance'] } )"
    )

    print(f"total time: {(time() - start_time)/60:.2f} minutes")