def processBOW(self, text):
    """Return the bag-of-words token set for a list of strings.

    The strings are joined, lowercased, and have basic punctuation
    characters replaced by spaces before tokenization; punctuation
    tokens and basic stopwords are then discarded.

    :param text: list of strings to process
    :returns: set of unique, filtered tokens
    """
    joined = " ".join(text).lower()
    cleaned = re.sub(r"[\.,;\-\"]", " ", joined)
    kept = set()
    for tok in tokenizeText(cleaned):
        if tok in punctuation or tok in basic_stopwords:
            continue
        kept.add(tok)
    return kept
def getOutlinkContextAtharAnnotated(context):
    """Return the annotated context as a flat list of tokens.

    Only lines whose sentiment annotation contains one of the labels
    "p", "n", "o" or "c" contribute tokens. URLs, the citation marker
    and ACL-style citations are stripped from each contributing line
    before tokenizing, and punctuation tokens are filtered out.

    :param context: dict with a "lines" list; each entry has "line"
        (the text) and "sentiment" (the annotation string, may be falsy)
    :returns: list of tokens
    """
    collected = []
    for entry in context["lines"]:
        label = entry["sentiment"]
        if not label:
            continue
        if not any(flag in label for flag in ("p", "n", "o", "c")):
            continue
        cleaned = removeURLs(entry["line"]).replace(CIT_MARKER, "")
        cleaned = removeACLCitations(cleaned)
        collected.extend(tokenizeText(cleaned))
    return [tok for tok in collected if tok not in punctuation]
def getOutlinkContextAtharWindowOfWords(context, left, right):
    """Return a window-of-words context around the citation marker.

    The context lines are concatenated, cleaned of URLs (normally
    footnotes and conversion errors) and ACL-style citations, and
    tokenized. The first occurrence of CIT_MARKER anchors a window of
    up to ``left`` tokens before it and ``right`` tokens after it.

    :param context: dict with a "lines" list; each entry has a "line" string
    :param left: max number of tokens to take before the marker
    :param right: max number of tokens to take after the marker
    :returns: list of tokens around the marker, or None if the marker
        is not found in the tokenized text
    """
    # NOTE(review): lines are joined with no separator, so a token may
    # merge across a line boundary — preserved from original; confirm
    # whether a " " separator was intended.
    context_text = "".join(line["line"] for line in context["lines"])
    context_text = removeURLs(context_text)
    context_text = removeACLCitations(context_text)
    tokens = [token for token in tokenizeText(context_text)
              if token not in punctuation]
    for index, token in enumerate(tokens):
        if token == CIT_MARKER:
            # Clamp the left edge at 0: a negative slice start would wrap
            # around and wrongly pull tokens from the END of the list when
            # the marker appears within the first `left` tokens.
            res = tokens[max(index - left, 0):index]
            res.extend(tokens[index + 1:index + right + 1])
            return res
    return None