def getDocBOWannotatedSections(doc, parameters=None, doctext=None):
    """
        Returns a dict where each key should be a field for the document in
        the Lucene index

        The first entry of doc.allsections is used as the abstract and every
        following section is concatenated into the body text. Abstract and
        body have citations stripped (removeCitations) and are lower-cased;
        the title is stored exactly as found in the metadata.

        Args:
            doc: document exposing doc["metadata"]["title"], doc.allsections
                and doc.getSectionText()
            parameters: not used here; accepted so that this function has the
                same signature as the other BOW functions
            doctext: optional full document text, passed on unchanged to
                addDocBOWFullTextField()
        Returns:
            {1: [fields]} where fields holds "title", "abstract", "text" and
            whatever addDocBOWFullTextField() adds to the dict
    """
    res = {
        "title": doc["metadata"]["title"],
        "abstract": "",
    }

    if len(doc.allsections) > 0:
        try:
            abstract = doc.getSectionText(doc.allsections[0], False)
            # Must be an assignment: a bare comparison here would discard the
            # cleaned text and leave the raw abstract in the index field.
            res["abstract"] = removeCitations(abstract).lower()
        except Exception:
            # Best effort: keep going with a placeholder instead of failing
            # the whole document, but let KeyboardInterrupt/SystemExit through.
            res["abstract"] = u"<UNICODE ERROR>"

    # Each section is prefixed with a space so that sections stay separated.
    body_parts = [
        " " + doc.getSectionText(doc.allsections[index], False)
        for index in range(1, len(doc.allsections))
    ]
    res["text"] = removeCitations("".join(body_parts)).lower()

    addDocBOWFullTextField(doc, res, doctext)
    return {1: [res]}
def processContext(match, doctext, wleft, wright):
    """
        Extract pre-tokenized context on both sides of a match.

        To avoid processing the whole document, only a window of characters
        around the match is cut out of doctext. The window size on each side
        is (words + 15) * ESTIMATED_AVERAGE_WORD_LENGTH characters.

        Args:
            match: object whose start() and end() give character offsets into
                doctext (presumably a regex match -- confirm with the caller)
            doctext: text that the match offsets refer to
            wleft: word budget used to size the window left of the match
            wright: word budget used to size the window right of the match
        Returns:
            dict with
                "left", "right": result of removePunctuation() applied to the
                    tokenized, citation-free window on each side
                "left_start", "right_end": character offsets delimiting the
                    window; left_start is clamped at 0, right_end is not
                    clamped to the length of doctext
    """
    def _prepare(fragment):
        # tokenize! (after stripping citations), then drop punctuation
        return removePunctuation(tokenizeText(removeCitations(fragment)))

    match_from = match.start()
    match_to = match.end()

    window_from = max(0, match_from - ((wleft + 15) * ESTIMATED_AVERAGE_WORD_LENGTH))
    window_to = match_to + ((wright + 15) * ESTIMATED_AVERAGE_WORD_LENGTH)

    before = _prepare(doctext[window_from:match_from])
    after = _prepare(doctext[match_to:window_to])

    return {
        "left": before,
        "right": after,
        "left_start": window_from,
        "right_end": window_to,
    }
def getDocBOWpassagesMulti(doc, parameters=[100], doctext=None):
    """
        Get BOWs for document by cutting its full text into passages

        The text (fetched with headers=True when doctext is not supplied) has
        its citations removed, is lower-cased and tokenized. For every passage
        size in parameters the tokens are split into consecutive windows of
        that size, followed by a second series of windows shifted by half a
        window, so that neighbouring passages overlap.

        Args:
            doc: full SciDoc to get text for
            parameters: passage sizes, in tokens. The default list is only
                read, never modified.
            doctext: optional text to use instead of fetching it from doc
        Returns:
            multiple BOWs in a dictionary where the keys are the parameters
            and each value is a list of {"text": passage} dicts, aligned
            windows first and half-shifted windows after them
    """
    if not doctext:
        doctext = doc.getFullDocumentText(doc, headers=True)
    doctext = removeCitations(doctext).lower()
    tokens = tokenizeText(doctext)
    total = len(tokens)

    res = {}
    for param in parameters:
        # Floor division keeps the offset an integer even when true division
        # is in effect (Python 3, or "from __future__ import division").
        half = param // 2
        starts = list(range(0, total, param)) + list(range(half, total, param))
        res[param] = [
            {"text": unTokenize(tokens[i:i + param])}
            for i in starts
        ]
    return res
def addDocBOWFullTextField(doc, res_dict, doctext=None):
    """
        Adds the _full_text field

        The text is stripped of citations, lower-cased, tokenized and joined
        back together with unTokenize() before being stored.

        Args:
            doc: document to fetch the full text from when doctext is empty
            res_dict: dict of fields, modified in place
            doctext: optional text to use instead of fetching it from doc
        Returns:
            None; the result is written to res_dict["_full_text"]
    """
    full_text = doctext if doctext else doc.getFullDocumentText(doc)
    normalised = removeCitations(full_text).lower()
    res_dict["_full_text"] = unTokenize(tokenizeText(normalised))
def getDocBOWfull(doc, parameters=None, doctext=None):
    """
        Get a single BOW for the document built from its full text

        Citations are removed and the text is lower-cased, then tokenized and
        joined back together with unTokenize().

        Args:
            doc: full SciDoc to get text for
            parameters: not used by this function
            doctext: optional text to use instead of fetching it from doc
        Returns:
            {1: [{"text": processed_text}]}
    """
    if doctext:
        source_text = doctext
    else:
        source_text = doc.getFullDocumentText(doc)

    cleaned = removeCitations(source_text).lower()
    bow = {"text": unTokenize(tokenizeText(cleaned))}

    # all functions must take a list of parameters and return
    # dict[parameter]=list of BOWs
    return {1: [bow]}
def getDocBOWTitleAbstract(doc, parameters=None, doctext=None):
    """
        Get BOW for document made up of only title and abstract

        The abstract is taken to be the first entry of doc.allsections. Title
        and abstract are joined, stripped of citations, lower-cased, tokenized
        and joined back together with unTokenize().

        Args:
            doc: document exposing doc["metadata"]["title"], doc.allsections
                and doc.getSectionText()
            parameters: not used by this function
            doctext: ignored; the text is always rebuilt from the title and
                the first section
        Returns:
            {1: [{"text": processed_text}]}
    """
    text = doc["metadata"]["title"] + ". "

    if len(doc.allsections) > 0:
        try:
            text += " " + doc.getSectionText(doc.allsections[0], False)
        except Exception:
            # Best effort: keep the title and mark the abstract as unreadable
            # (the marker is lower-cased along with the rest of the text
            # below), while letting KeyboardInterrupt/SystemExit propagate.
            text += u"<UNICODE ERROR>"

    text = removeCitations(text).lower()
    return {1: [{"text": unTokenize(tokenizeText(text))}]}