def serialize_doc(doc: Doc) -> Dict[str, any]:  # NOTE(review): `any` here is the builtin; should be `typing.Any` — needs `from typing import Any` at the top of the file.
    """Serialize a spaCy ``Doc`` into a JSON-friendly dictionary.

    Args:
        doc: The spaCy ``Doc`` to serialize.

    Returns:
        A dictionary containing the raw document text, one serialized
        entry per token, serialized noun-chunk spans, and spaCy's own
        ``Doc.to_json`` output under ``'data'``.
    """
    return {
        'text': doc.text,
        # A Doc iterates over its tokens directly; no need to call __iter__().
        'tokens': [serialize_token(token) for token in doc],
        'noun_chunks': [serialize_span(chunk) for chunk in doc.noun_chunks],
        'data': doc.to_json(),
    }
def encode(cls, obj: Doc) -> str:
    """Encode a spaCy ``Doc`` as a JSON string for caching.

    Starts from ``Doc.to_json()`` and, when neuralcoref results are
    attached to the document, adds a ``"neuralcoref"`` entry describing
    each coreference cluster.

    Args:
        obj: The ``Doc`` to encode.

    Returns:
        The JSON-serialized document.
    """
    payload = obj.to_json()

    if obj._.has("huggingface_neuralcoref"):

        def _span_as_dict(span):
            # Minimal JSON-friendly view of a Span.
            return {
                "start": span.start,
                "end": span.end,
                "text": span.text,
            }

        def _cluster_as_dict(cluster):
            # Flatten a neuralcoref Cluster into plain dictionaries.
            return {
                "i": cluster.i,
                "main": _span_as_dict(cluster.main),
                "mentions": [_span_as_dict(mention) for mention in cluster.mentions],
            }

        # Fold the coreference clusters into the serialized document.
        payload = {
            **payload,
            "neuralcoref": [
                _cluster_as_dict(cluster) for cluster in obj._.coref_clusters
            ],
        }

    # Convert the Spacy Doc to json before caching
    return json.dumps(payload)