def generateDocBOW_ILC_Annotated(doc_incoming, parameters, doctext=None, full_paragraph=True):
    """
    Create zone-annotated BOWs from all the inlink contexts of a document.

    For every document that cites ``doc_incoming`` (its "inlinks"), the
    sentences selected around each citation are grouped by their ``az`` label
    and by their ``csc_type`` label, producing one context dict per citation.

    Args:
        doc_incoming: object exposing a ``metadata`` attribute that maps
            "guid" to the document GUID. Attribute access is used, so a plain
            dict without a ``.metadata`` attribute will not work.
        parameters: re-iterable collection of hashable values. Each value is
            used as a key of the result and is passed unchanged to
            ``selectSentencesToAdd`` as the sentence-selection parameter.
        doctext: unused; kept for signature compatibility.
        full_paragraph: unused; kept for signature compatibility.

    Returns:
        A ``defaultdict(list)`` mapping each parameter to a list of context
        dicts. Every context dict holds one "ilc_AZ_<zone>" key per entry of
        ``AZ_ZONES_LIST`` and one "ilc_CSC_<zone>" key per entry of
        ``CORESC_LIST`` (space-joined sentence text, "" when empty), plus
        "guid_from" and "year_from" copied from the citing document.

    Raises:
        KeyError: if a sentence carries an ``az`` or ``csc_type`` label for
            which no key was pre-created from the two zone lists.
    """
    doc_incoming_guid = doc_incoming.metadata["guid"]

    # Pre-seed every parameter so it is present in the result even when the
    # document has no incoming citations.
    all_contexts = defaultdict(list)
    for param in parameters:
        all_contexts[param] = []

    doc_metadata = cp.Corpus.getMetadataByGUID(doc_incoming_guid)
    print("Building VSM representations for ", doc_metadata["guid"], ":", len(doc_metadata["inlinks"]), "incoming links")

    for inlink_guid in doc_metadata["inlinks"]:
        # loads from cache if exists, XML otherwise
        docfrom = cp.Corpus.loadSciDoc(inlink_guid)
        # NOTE(review): the AZ annotation step (cp.Corpus.annotateDoc) was
        # disabled in the original; sentences are presumably annotated
        # upstream -- confirm.

        # important! the doctext here has to be that of the docfrom, NOT doc_incoming
        # NOTE(review): the value is never read in this function; the call is
        # kept in case getFullDocumentText() has side effects -- confirm and
        # drop if it does not.
        doctext = docfrom.getFullDocumentText()
        ref_id = identifyReferenceLinkIndex(docfrom, doc_incoming_guid)

        print("Document with incoming citation links loaded:", docfrom["metadata"]["filename"])

        # The citations pointing at doc_incoming do not depend on the
        # parameter, so filter them once per citing document.
        citations = [cit for cit in docfrom["citations"] if cit["ref_id"] == ref_id]

        for param in parameters:
            for cit in citations:
                to_add = selectSentencesToAdd(docfrom, cit, param)

                # One empty text field per known zone, for both schemes.
                context = {"ilc_AZ_" + zone: "" for zone in AZ_ZONES_LIST}
                for zone in CORESC_LIST:
                    context["ilc_CSC_" + zone] = ""

                for sent_id in to_add:
                    sent = docfrom.element_by_id[sent_id]
                    text = formatSentenceForIndexing(sent)

                    # Sentences without an AZ label contribute no AZ text.
                    az_label = sent.get("az", "")
                    if az_label != "":
                        context["ilc_AZ_" + az_label] += " " + text

                    # Sentences without a CoreSC label default to "Bac"; the
                    # default is written back onto the sentence itself, as in
                    # the original implementation.
                    if "csc_type" not in sent:
                        sent["csc_type"] = "Bac"
                    context["ilc_CSC_" + sent["csc_type"]] += " " + text

                context["guid_from"] = docfrom["metadata"]["guid"]
                context["year_from"] = docfrom["metadata"]["year"]
                all_contexts[param].append(context)

    # Each entry is already a list holding one context per incoming citation,
    # so no further wrapping of the entries is done here.
    return all_contexts