示例#1
0
    def processBOW(self, text):
        """
            Builds a bag-of-words from a list of strings.

            The strings are joined with single spaces and lower-cased, and the
            characters . , ; - " are replaced by spaces before the text is
            passed to tokenizeText. Tokens found in `punctuation` or in
            `basic_stopwords` are discarded.

            :param text: list of strings
            :return: set of the remaining tokens
        """
        lowered = " ".join(text).lower()
        cleaned = re.sub(r"[\.,;\-\"]", " ", lowered)
        bag = set()
        for candidate in tokenizeText(cleaned):
            # Skip punctuation tokens and stopwords; everything else is kept.
            if candidate in punctuation or candidate in basic_stopwords:
                continue
            bag.add(candidate)
        return bag
示例#2
0
def getOutlinkContextAtharAnnotated(context):
    """
        Collects the tokens of the annotated lines of a context.

        A line of context["lines"] is used only when its "sentiment" value is
        truthy and contains at least one of "p", "n", "o" or "c". The text of
        each such line is passed through removeURLs, has every occurrence of
        CIT_MARKER removed, is passed through removeACLCitations and is then
        tokenized with tokenizeText.

        :param context: dict with a "lines" list; each entry has "sentiment"
            and "line" keys
        :return: list of tokens, with tokens found in `punctuation` removed
    """
    sentiment_flags = ("p", "n", "o", "c")
    collected = []
    for entry in context["lines"]:
        sentiment = entry["sentiment"]
        if not sentiment:
            continue
        if not any(flag in sentiment for flag in sentiment_flags):
            continue
        line_text = removeURLs(entry["line"])
        line_text = line_text.replace(CIT_MARKER, "")
        line_text = removeACLCitations(line_text)
        collected.extend(tokenizeText(line_text))
    return [token for token in collected if token not in punctuation]
示例#3
0
def getOutlinkContextAtharWindowOfWords(context, left, right):
    """
        Returns a window-of-words context: list of tokens

        The "line" text of every entry in context["lines"] is concatenated,
        cleaned with removeURLs and removeACLCitations, tokenized with
        tokenizeText and stripped of tokens found in `punctuation`. The window
        is taken around the first token equal to CIT_MARKER; the marker itself
        is not part of the result.

        :param context: dict with a "lines" list; each entry has a "line" key
        :param left: maximum number of tokens to take before the marker
        :param right: maximum number of tokens to take after the marker
        :return: list of up to `left` tokens preceding the marker followed by
            up to `right` tokens following it, or None if no token equals
            CIT_MARKER
    """
    # NOTE(review): lines are concatenated with no separator; presumably each
    # line carries its own trailing whitespace -- confirm against the caller.
    context_text = "".join([line["line"] for line in context["lines"]])
    # remove URLs in text (normally footnotes and conversion errors)
    context_text = removeURLs(context_text)
    context_text = removeACLCitations(context_text)
    tokens = tokenizeText(context_text)
    tokens = [token for token in tokens if token not in punctuation]
    for index, token in enumerate(tokens):
        if token == CIT_MARKER:
            # Clamp the start at 0: when the marker is closer than `left`
            # tokens to the beginning, index-left is negative and a negative
            # slice start counts from the END of the list, which would drop
            # or corrupt the left-hand context.
            start = max(0, index - left)
            res = []
            res.extend(tokens[start:index])
            res.extend(tokens[index + 1:index + right + 1])
            return res
    return None