Пример #1
0
def read_json_input(input_data_path):
    """Reads a JSON corpus and converts it into a DataSetProto.

    The input must be a JSON array of objects with this format:

      [{"URL": "http://example.com/path.html", "Contents": "Text content of webpage here"},
       {"URL": "http://example.com/path.html", "Contents": "Text content of webpage here"}]

    Only the "Contents" value of each object is read; "URL" is ignored.

    Args:
      input_data_path: Path of the JSON file to read, or '-' to read the
        JSON text from standard input.

    Returns:
      A page_view_sequence_pb2.DataSetProto holding one sequence per input
      object. Each sequence has a single page view with a single content
      entry listing the distinct code points of "Contents" in ascending
      order.

    Raises:
      ValueError: If the input is not valid JSON (raised by json.loads).
      KeyError: If an object has no "Contents" key.
    """
    if input_data_path == '-':
        data = sys.stdin.read()
    else:
        # JSON text is UTF-8; do not rely on the locale's default encoding.
        with open(input_data_path, 'r', encoding='utf-8') as input_json_file:
            data = input_json_file.read()

    result = page_view_sequence_pb2.DataSetProto()
    for item in json.loads(data):
        page_content_proto = page_view_sequence_pb2.PageContentProto()
        # De-duplicate the code points; sort so the output is deterministic
        # instead of following set iteration order.
        page_content_proto.codepoints.extend(
            sorted({ord(character) for character in item["Contents"]}))

        page_view_proto = page_view_sequence_pb2.PageViewProto()
        page_view_proto.contents.append(page_content_proto)

        page_view_sequence = page_view_sequence_pb2.PageViewSequenceProto()
        page_view_sequence.page_views.append(page_view_proto)
        result.sequences.append(page_view_sequence)
    return result
Пример #2
0
def main(argv):
    """Prints a text-format data set built from one or more file paths.

    Every argument after argv[0] is passed to create_page_view; the
    resulting page views are gathered, in order, into a single sequence.
    That sequence becomes the only entry of a DataSetProto, which is
    printed to standard output in protobuf text format.

    Args:
      argv: Argument list; argv[0] is skipped and the remaining entries
        are treated as file paths.
    """
    views = [create_page_view(path) for path in argv[1:]]

    only_sequence = page_view_sequence_pb2.PageViewSequenceProto()
    only_sequence.page_views.extend(views)

    output = page_view_sequence_pb2.DataSetProto()
    output.sequences.append(only_sequence)

    print(text_format.MessageToString(output))
Пример #3
0
def pv_sequence(a_sequence):
    """Wraps page views in a PageViewSequenceProto whose id is 42.

    Args:
      a_sequence: Iterable of page view protos to place, in order, into
        the sequence's page_views field.

    Returns:
      A new page_view_sequence_pb2.PageViewSequenceProto containing the
      given page views, with its id field set to 42.
    """
    wrapped = page_view_sequence_pb2.PageViewSequenceProto(id=42)
    wrapped.page_views.extend(a_sequence)
    return wrapped