Exemplo n.º 1
0
def read_json_input(input_data_path):
    """Build a DataSetProto from a JSON corpus of web pages.

    The JSON document is a list of objects shaped like:

      [{"URL": "http://example.com/path.html", "Contents": "Text content of webpage here"},
       {"URL": "http://example.com/path.html", "Contents": "Text content of webpage here"}]

    Only the "Contents" value of each object is read. Every object yields
    one sequence holding a single page view with a single content entry,
    whose codepoints are the distinct code points of that text.

    Args:
        input_data_path: Path of the JSON file, or '-' to read standard input.

    Returns:
        The populated page_view_sequence_pb2.DataSetProto.
    """
    if input_data_path != '-':
        with open(input_data_path, 'r') as json_source:
            raw_json = json_source.read()
    else:
        raw_json = sys.stdin.read()
    pages = json.loads(raw_json)

    data_set = page_view_sequence_pb2.DataSetProto()
    for page in pages:
        # A set collapses repeated characters so each code point is stored once.
        distinct_codepoints = {ord(character) for character in page["Contents"]}
        # add() creates each nested message in place, giving the nesting
        # sequence -> page view -> content.
        content = data_set.sequences.add().page_views.add().contents.add()
        content.codepoints.extend(distinct_codepoints)
    return data_set
Exemplo n.º 2
0
def main(argv):
    """Print a text-format data set built from the given file paths.

    Every path after argv[0] is converted with create_page_view; all the
    resulting page views are gathered into a single sequence, which is the
    only sequence of the printed DataSetProto.

    Args:
        argv: Command-line arguments; argv[0] is skipped.
    """
    data_set = page_view_sequence_pb2.DataSetProto()
    # The sequence is added even when no paths are given, so the data set
    # always contains exactly one (possibly empty) sequence.
    single_sequence = data_set.sequences.add()
    input_paths = argv[1:]
    for input_path in input_paths:
        single_sequence.page_views.append(create_page_view(input_path))

    print(text_format.MessageToString(data_set))
Exemplo n.º 3
0
def read_text_input(input_data_path):
    """Parse a text-format DataSetProto from a file or standard input.

    Args:
        input_data_path: Path of the text proto file, or '-' to read
            standard input.

    Returns:
        A page_view_sequence_pb2.DataSetProto filled in by text_format.Parse.
    """
    if input_data_path != '-':
        with open(input_data_path, 'r') as proto_source:
            proto_text = proto_source.read()
    else:
        proto_text = sys.stdin.read()

    parsed = page_view_sequence_pb2.DataSetProto()
    text_format.Parse(proto_text, parsed)
    return parsed
Exemplo n.º 4
0
def read_text_input(input_data_path):
  """Load a text-format DataSetProto from the file at input_data_path.

  Args:
    input_data_path: Path of the text proto file to read.

  Returns:
    A page_view_sequence_pb2.DataSetProto with the file's contents merged in
    via text_format.Merge.
  """
  with open(input_data_path, 'r') as proto_source:
    proto_text = proto_source.read()
  loaded = page_view_sequence_pb2.DataSetProto()
  text_format.Merge(proto_text, loaded)
  return loaded