# Example #1 (score: 0)
def skill_result_2_graph(resp_run_node_action):
    """
    Convert skill_pb2.RespRunNodeAction to graphUpload.proto Graph
    so that in python we have only one graph format.

    Keyword Arguments:
    resp_run_node_action -- skill_pb2.RespRunNodeAction, result from get_graph_from_neo4j

    Return:
    graphUpload_pb2.GraphUploadRequest containing the converted graph
    """
    graph_upload_request = graphUpload_pb2.GraphUploadRequest()

    # Only the first graph of the response is converted -- presumably the
    # caller never sends more than one; TODO confirm.
    for n in resp_run_node_action.graphs[0].graph.nodes:
        node = graph_upload_request.graph.nodes.add()
        for p in n.node_prop.props.entries:
            if p.key == '_type':
                node.props.type = p.value
            elif p.key == '_s_graph_upload_tag':
                # the upload tag is request-level metadata, not a node prop
                graph_upload_request.uploadTag = p.value
            else:
                field = node.props.props.entries.add()
                field.key = p.key
                field.value = p.value
    # BUG FIX: the docstring documents a return value but the original
    # function fell off the end and returned None.
    return graph_upload_request
# Example #2 (score: 0)
def upload_single_edge(e):
    """Upload one wiki edge tuple to the graph service, retrying on
    transient failures with exponential backoff.

    Keyword Arguments:
    e -- edge tuple (node_from, node_to, edge_type); edge_type 0 builds a
         Category->Page "HasElement" edge, anything else a
         Category->Category "HasSubset" edge.

    Return:
    number of edges created + updated + skipped per the response
    statistics (0 when nothing was uploaded).
    """
    res = None
    retry = 0
    uploaded_number = 0
    while res is None:
        # BUG FIX: reset per attempt -- the original set `error` once and
        # never cleared it, so after one retriable error every later
        # successful response was discarded and the loop retried forever.
        error = None
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            node_from = e[0]
            node_to = e[1]
            edge_type = e[2]

            # page edge: Category -> Page
            if edge_type == 0:
                edge = graph_upload_request.graph.edges.add()
                edge.props.type = "HasElement"
                edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_from)
                edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                    node_to)
            # categories edge: Category -> Category
            else:
                if node_from in IGNORE_CATEGORIES:
                    break
                edge = graph_upload_request.graph.edges.add()
                edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_from)
                edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_to)
                edge.props.type = "HasSubset"
            graph_upload_request.uploadTag = "uploadWikiEdge"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
            print(res)
        # BUG FIX: the handlers bound the exception to `e`, shadowing the
        # edge parameter; Python unbinds the name after the handler, so the
        # ylog.debug(e) below raised NameError (hidden by the bare except).
        except HTTPError as http_err:
            if http_err.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (
                    http_err.code, http_err.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as retriable_err:
            error = 'A retriable error occurred: %s' % retriable_err
        except GRAPH_EXCEPTIONS:
            # non-retriable graph-level failure: give up on this edge
            break

        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                exit("no loger attempting to retry.")
            # exponential backoff with jitter
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    try:
        if res.edgeUpdateResultStatistics:
            ylog.debug(res.edgeUpdateResultStatistics)
            uploaded_number = (res.edgeUpdateResultStatistics.numOfCreations +
                               res.edgeUpdateResultStatistics.numOfUpdates +
                               res.edgeUpdateResultStatistics.numOfSkips)
            ylog.debug(e)
    except AttributeError:
        # res is still None when the edge was skipped via `break` above
        pass
    return uploaded_number
# Example #3 (score: 0)
def upload_node(dict_re_match_object):
    """ upload regular expression object in the dictionary in a batch.
    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill node properties.
    use encoded original Chinese title plus url as url property.
    4. if there's any error upload response, retry.
    5. print upload statistics.
    Keyword Arguments:
    dict_re_match_object -- dict mapping index -> re match object (or None);
    each match's group() is expected to be a quoted title -- TODO confirm
    against the caller.
    Return:
    number of nodes created + updated + skipped per the response statistics.
    """
    res = None
    retry = 0
    nodes_fail_retry = 0
    uploaded_number = 0
    while res is None:
        # BUG FIX: reset both flags each attempt -- the original set them
        # once and never cleared them, so after one retriable error every
        # later successful response was discarded.
        error = None
        re_upload_error = None
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            # iterate nodes batch
            for index, value in dict_re_match_object.items():
                if value is not None:
                    # strip the surrounding quote characters from the match
                    title = value.group()[1:-1]
                    zh_title = HanziConv.toSimplified(title)
                    node = graph_upload_request.graph.nodes.add()
                    node.props.type = "readonlyDoc"
                    p2 = node.props.props.entries.add()
                    p2.key = "_s_import_source"
                    p2.value = "word2vec model"
                    # a Google search URL doubles as the unique business ID
                    node.businessID.url = "https://www.google.com.hk/search?hl=en&source=hp&q=" + quote_plus(
                        title)
                    node.names.chinese = zh_title

            # other information of the upload request
            graph_upload_request.uploadTag = "UploadWord2VecVocabNodes"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
        except HTTPError as http_err:
            if http_err.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (
                    http_err.code, http_err.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as retriable_err:
            error = 'A retriable error occurred: %s' % retriable_err
        try:
            if res.failedNodes:
                # BUG FIX: the original formatted `res.failedNodeds` (typo),
                # whose AttributeError was swallowed by a bare except, so
                # failed-node retries never triggered.
                re_upload_error = "some nodes failed to upload %s" % res.failedNodes
        except AttributeError:
            # res is None when the request itself raised above
            pass
        if re_upload_error is not None:
            print(re_upload_error)
            nodes_fail_retry += 1
            res = None
            if nodes_fail_retry > NODES_FAIL_MAX_RETRIES:
                ylog.debug(res)
                # sentinel: abandon this batch but leave the loop
                res = "continue"

        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                # original deliberately keeps retrying past the limit
                ylog.debug(res)
            ylog.debug(res)
            # exponential backoff with jitter
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    # jumped out: res is a real response or the "continue" sentinel
    try:
        if res.nodeUpdateResultStatistics:
            ylog.debug(res.nodeUpdateResultStatistics)
            uploaded_number = (res.nodeUpdateResultStatistics.numOfCreations +
                               res.nodeUpdateResultStatistics.numOfUpdates +
                               res.nodeUpdateResultStatistics.numOfSkips)
        if res.uploadedNodes:
            for updated in res.uploadedNodes:
                ylog.debug("uploaded node GID: %s" % updated.gid)
        if res.failedNodes:
            for err in res.failedNodes:
                # 202001 is treated as a benign duplicate -- TODO confirm
                if err.error.errorCode != 202001:
                    ylog.info(err.error)
                    ylog.debug(err.error)
    except AttributeError:
        # res is the "continue" string sentinel when the batch was abandoned
        pass

    return uploaded_number
# Example #4 (score: 0)
def upload_edge_from_graph(ls_edges, batch_size):
    """ upload edges to the graph service in batches, retrying on
    transient failures with exponential backoff.
    1. slice the edge list into batches of batch_size.
    2. create a graph upload request per batch.
    3. fill edge properties; set edge start node and end node.
    4. if there's any retriable error in the upload response, retry.
    5. accumulate upload statistics.
    Edge tuple layout: (node_from, node_to, edge_type); edge_type 0 builds
    a Category->Page "HasElement" edge, anything else a Category->Category
    "HasSubset" edge.
    Keyword Arguments:
    ls_edges   -- list of edge tuples
    batch_size -- number of edges per upload request
    Return:
    total number of edges created + updated + skipped.
    """
    len_edges = len(ls_edges)
    uploaded_number = 0
    batch_counter = 0
    for edge_counter in tqdm(range(0, len_edges, batch_size)):
        res = None
        retry = 0
        while res is None:
            # BUG FIX: reset per attempt -- the original only cleared
            # `error` after MAX_RETRIES, so after one retriable error every
            # later successful response was discarded.
            error = None
            try:
                graph_upload_request = graphUpload_pb2.GraphUploadRequest()
                for e in ls_edges[batch_counter:batch_counter + batch_size]:
                    node_from = e[0]
                    node_to = e[1]
                    edge_type = e[2]

                    # page edge: Category -> Page
                    if edge_type == 0:
                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            node_to)
                    # categories edge: Category -> Category
                    else:
                        if node_from in IGNORE_CATEGORIES:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_to)
                        edge.props.type = "HasSubset"
                graph_upload_request.uploadTag = "uploadWikiEdge"
                graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                res = gs_call.upload_graph(graph_upload_request)

            except HTTPError as http_err:
                if http_err.code in RETRIABLE_STATUS_CODES:
                    error = 'A retriable HTTP error %d occurred:\n%s' % (
                        http_err.code, http_err.reason)
                else:
                    raise
            except RETRIABLE_EXCEPTIONS as retriable_err:
                error = 'A retriable error occurred: %s' % retriable_err
            except GRAPH_EXCEPTIONS as graph_err:
                # non-retriable graph-level failure: skip this batch
                ylog.debug('A graph error occurred: %s' % graph_err)
                break
            if error is not None:
                print(error)
                retry += 1
                res = None
                if retry > MAX_RETRIES:
                    # original deliberately keeps retrying past the limit
                    ylog.debug(res)
                    ylog.debug("no loger attempting to retry.")
                # exponential backoff with jitter
                max_sleep = 2**retry
                sleep_seconds = random.random() * max_sleep
                print('Sleeping %f seconds and then retrying...' %
                      sleep_seconds)
                time.sleep(sleep_seconds)
        try:
            if res.edgeUpdateResultStatistics:
                ylog.debug(res.edgeUpdateResultStatistics)
                number = (res.edgeUpdateResultStatistics.numOfCreations +
                          res.edgeUpdateResultStatistics.numOfUpdates +
                          res.edgeUpdateResultStatistics.numOfSkips)
                uploaded_number += number
            if res.failedEdges:
                for err in res.failedEdges:
                    print(err)
                    print("start node: %s" %
                          err.edge.startNodeID.primaryKeyInDomain)
                    print("end node: %s" %
                          err.edge.endNodeID.primaryKeyInDomain)
        except AttributeError:
            # res is still None when the batch hit a GRAPH_EXCEPTIONS break
            pass
        batch_counter += batch_size

    return uploaded_number
# Example #5 (score: 0)
def upload_edge(dict_re_match_object):
    """ upload edge regular expression object in the dictionary in a batch.
    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill edge properties.
    set edge start node and end node.
    4. if there's any error upload response, retry.
    5. print upload statistics.
    Match layout (example row):
    (9,'En-3_使用者','MOUNTAIN','2015-09-02 13:44:06','','uppercase','page')
    group(2) is the category title, group(3) the target title, group(7)
    the link type ('page' or 'subcat') -- TODO confirm against the regex.
    Keyword Arguments:
    dict_re_match_object -- dict mapping index -> re match object (or None)
    Return:
    number of edges created + updated + skipped per the response statistics.
    """
    res = None
    retry = 0
    uploaded_number = 0
    while res is None:
        # BUG FIX: reset per attempt -- the original only cleared `error`
        # after MAX_RETRIES, so after one retriable error every later
        # successful response was discarded.
        error = None
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            # iterate edges batch
            for index, value in dict_re_match_object.items():
                if value is not None:
                    item = dict_re_match_object.get(index)
                    edge_type = item.group(7)[1:-1]
                    if edge_type == 'page':
                        page_title = item.group(3)[1:-1]
                        cat_title = item.group(2)[1:-1]
                        # titles split across dump lines keep only the tail
                        if '\\n' in cat_title:
                            end = cat_title.split("\\n")
                            cat_title = end[-1]
                        if '\\n' in page_title:
                            end = page_title.split("\\n")
                            page_title = end[-1]
                        page_title = page_title.replace(" ", "_")
                        page_title_zh = HanziConv.toSimplified(page_title)
                        cat_title_zh = HanziConv.toSimplified(cat_title)

                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            cat_title)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            page_title)

                    if edge_type == 'subcat':
                        subcat_title = item.group(3)[1:-1]
                        cat_title = item.group(2)[1:-1]
                        if '\\n' in cat_title:
                            end = cat_title.split("\\n")
                            cat_title = end[-1]
                        if '\\n' in subcat_title:
                            end = subcat_title.split("\\n")
                            subcat_title = end[-1]
                        subcat_title = subcat_title.replace(" ", "_")
                        subcat_title_zh = HanziConv.toSimplified(subcat_title)
                        cat_title_zh = HanziConv.toSimplified(cat_title)

                        # skip self-referencing category edges
                        if subcat_title_zh == cat_title_zh:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            cat_title)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            subcat_title)
                        edge.props.type = "HasSubset"

            graph_upload_request.uploadTag = "uploadWikiEdge"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
        except HTTPError as http_err:
            if http_err.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (
                    http_err.code, http_err.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as retriable_err:
            error = 'A retriable error occurred: %s' % retriable_err
        except GRAPH_EXCEPTIONS:
            # non-retriable graph-level failure: give up on this batch
            break
        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                # original deliberately keeps retrying past the limit
                ylog.debug(res)
                ylog.debug("no loger attempting to retry.")
            # exponential backoff with jitter
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    try:
        if res.edgeUpdateResultStatistics:
            ylog.debug(res.edgeUpdateResultStatistics)
            uploaded_number = (res.edgeUpdateResultStatistics.numOfCreations +
                               res.edgeUpdateResultStatistics.numOfUpdates +
                               res.edgeUpdateResultStatistics.numOfSkips)
        if res.failedEdges:
            for err in res.failedEdges:
                ylog.debug(err)
                ylog.debug("start node: %s" %
                           err.edge.startNodeID.primaryKeyInDomain)
                ylog.debug("end node: %s" %
                           err.edge.endNodeID.primaryKeyInDomain)
    except AttributeError:
        # res is still None when the batch hit a GRAPH_EXCEPTIONS break
        pass

    return uploaded_number
# Example #6 (score: 0)
def upload_edge(ls_edges):
    """upload edges to the graph service in batches of BATCH_SIZE,
    retrying on transient failures with exponential backoff.
    Edge tuple layout: (node_from, node_to, edge_type); edge_type 0 builds
    a Category->Page "HasElement" edge, anything else a Category->Category
    "HasSubset" edge.
    Parameters:
    ls_edges -- list of edge tuples
    Return:
    total number of edges created + updated + skipped.
    """
    len_edges = len(ls_edges)
    uploaded_number = 0
    batch_counter = 0
    for edge_counter in tqdm(range(0, len_edges, BATCH_SIZE)):
        res = None
        retry = 0
        while res is None:
            # BUG FIX: reset per attempt -- the original set `error` once
            # and never cleared it, so after one retriable error every
            # later successful response was discarded until MAX_RETRIES
            # forced an exit.
            error = None
            try:
                graph_upload_request = graphUpload_pb2.GraphUploadRequest()
                for e in ls_edges[batch_counter:batch_counter + BATCH_SIZE]:
                    node_from = e[0]
                    node_to = e[1]
                    edge_type = e[2]

                    # page edge: Category -> Page
                    if edge_type == 0:
                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            node_to)
                    # categories edge: Category -> Category
                    else:
                        if node_from in IGNORE_CATEGORIES:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_to)
                        edge.props.type = "HasSubset"
                graph_upload_request.uploadTag = "uploadWikiEdge"
                graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                res = gs_call.upload_graph(graph_upload_request)

            except HTTPError as http_err:
                if http_err.code in RETRIABLE_STATUS_CODES:
                    error = 'A retriable HTTP error %d occurred:\n%s' % (
                        http_err.code, http_err.reason)
                else:
                    raise
            except RETRIABLE_EXCEPTIONS as retriable_err:
                error = 'A retriable error occurred: %s' % retriable_err
            except GRAPH_EXCEPTIONS as graph_err:
                # non-retriable graph-level failure: skip this batch
                ylog.debug('A graph error occurred: %s' % graph_err)
                break
            if error is not None:
                print(error)
                retry += 1
                res = None
                if retry > MAX_RETRIES:
                    ylog.debug(res)
                    exit("no loger attempting to retry.")
                # exponential backoff with jitter
                max_sleep = 2**retry
                sleep_seconds = random.random() * max_sleep
                print('Sleeping %f seconds and then retrying...' %
                      sleep_seconds)
                time.sleep(sleep_seconds)
        try:
            if res.edgeUpdateResultStatistics:
                ylog.debug(res.edgeUpdateResultStatistics)
                number = (res.edgeUpdateResultStatistics.numOfCreations +
                          res.edgeUpdateResultStatistics.numOfUpdates +
                          res.edgeUpdateResultStatistics.numOfSkips)
                uploaded_number += number
            if res.failedEdges:
                for err in res.failedEdges:
                    ylog.debug(err)
                    ylog.debug("start node: %s" %
                               err.edge.startNodeID.primaryKeyInDomain)
                    ylog.debug("end node: %s" %
                               err.edge.endNodeID.primaryKeyInDomain)
        except AttributeError:
            # res is still None when the batch hit a GRAPH_EXCEPTIONS break
            pass
        batch_counter += BATCH_SIZE

    return uploaded_number
# Example #7 (score: 0)
            pk_str = "https://zh.wikipedia.org/wiki/Category:" + '/' + item_str
        else:
            pk_str = "https://zh.wikipedia.org/wiki/" + '/' + item_str
        pk_md5 = hashlib.md5(pk_str.encode('utf-8')).hexdigest().upper()
        print(pk_md5)


# Print MD5 fingerprints of category and page primary keys -- presumably
# for cross-checking IDs against the graph service; verify against the
# print_mdf5 definition above.
print_mdf5(wiki_category, 1, category=True)
print_mdf5(wiki_page, 2, category=False)

# Record the raw link count, then free the (large) list before parsing.
category_link_size = len(wiki_category_link)
del wiki_category_link
# NOTE(review): the same regex search runs three times here -- the later
# calls could reuse `item` / `item.span()` instead; confirm before changing.
last_span = wiki_category_link_re.search(category_link).span()[0]
item = wiki_category_link_re.search(category_link, last_span)
last_span = wiki_category_link_re.search(category_link, last_span).span()[1]
# Build a one-edge upload request from the first matched category link.
graph_upload_request = graphUpload_pb2.GraphUploadRequest()
edge = graph_upload_request.graph.edges.add()
# edge from the first node to the second node
if item.group(7)[1:-1] == 'page':
    page_id = int(item.group(1))
    page_title = item.group(3)[1:-1]
    cat_title = item.group(2)[1:-1]
    if '\\n' in cat_title:
        end = cat_title.split("\\n")
        cat_title = end[-1]
    if '\\n' in page_title:
        end = page_title.split("\\n")
        page_title = end[-1]
    edge.props.type = "HasElement"

    edge.startNodeID.domain = "https://zh.wikipedia.org/wiki/Category:"