class TokenizingEmbeddingVectorizer:
    """
    This vectorizer first tokenizes its input.
    """

    def __init__(self, embedding_file, ignored_tokens=set()):

        self.vectorizer = EmbeddingVectorizer(embedding_file, ignored_tokens)
        self.tokenizer = TreebankWordTokenizer()
        self.embeddings = self.vectorizer.embeddings
        self.token2Index = self.vectorizer.token2Index

    def tokenize_sentences(self, sentences):
        tokenized_sentences = list(map(lambda sentence: " ".join(self.tokenizer.tokenize(sentence)), sentences))
        return tokenized_sentences

    def prepare_data(self, sentences, labels):

        prepared_sentences = self.tokenize_sentences(sentences)
        
        return self.vectorizer.prepare_data(prepared_sentences, labels)

    def sentences_to_padded_indices(self, sentences, max_length, padding="pre"):
        return self.vectorizer.sentences_to_padded_indices(sentences, max_length, padding)

    def sentences_to_indices(self, sentences):
        return np.array(self.vectorizer.sentences_to_indices(sentences))
class TreebankSpanTokenizer(TreebankWordTokenizer):
    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            yield (ix, end)
            ix = end

    def tokenize(self, text, withSpans=False):
        tokens = self._word_tokenizer.tokenize(text)

        if not withSpans:
            return tokens

        spans = []
        ix = 0
        for word_token in tokens:
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            spans.append((ix, end))
            ix = end

        return zip(tokens, spans)
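For reference, a minimal usage sketch of the span-aware tokenizer above (assuming the class is in scope); each span indexes back into the original string:

tokenizer = TreebankSpanTokenizer()
text = "Don't panic."
for token, (start, end) in tokenizer.tokenize(text, withSpans=True):
    # the span recovers the token's exact slice of the original text
    print(token, (start, end), text[start:end])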
Example #3
 def get_tf_idf_score(self, sentence, mode, ngram=1):
     if ngram not in range(1, 4):
         raise ValueError("Only unigrams, bigrams and trigrams are supported.")
     if mode != "lex" and mode != "pos":
         raise ValueError("Only lexical and POS distinctness supported.")
     if len(self.document_freqs_lex) == 0 or len(self.document_freqs_pos) == 0:
         raise AttributeError("Document frequency dictionaries not initialized. "
                              "Call load_doc_freqs() on the LM object.")
     tokenizer = TreebankWordTokenizer()
     sentence = sentence.lower()
     tokens = tokenizer.tokenize(sentence)
     tokens = self.__fix_tokens(tokens)
     tags = nltk.pos_tag(tokens)
     tags = self.__add_start_end_tags(tags)
     if mode == "lex":
         score = self.__get_lex_tf_idf(tags, ngram)
         return score
     else:
         score = self.__get_pos_tf_idf(tags, ngram)
         return score
Example #4
File: lda.py Project: 0077cc/NewsBlur
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
  from nltk.tokenize.treebank import TreebankWordTokenizer
  tokenizer = TreebankWordTokenizer()

  from nltk.corpus import stopwords
  stop = stopwords.words('english')
  
  from string import ascii_lowercase
  
  docs = {}
  print("Found %i stories" % stories.count())
  for story in stories:
    text = zlib.decompress(story.story_content_z)
    # text = story.story_title
    text = ''.join(BeautifulSoup(text).findAll(text=True)).lower()
    if delimiter:
      sections = text.split(delimiter)
    else:
      sections = [text]
            
    if doc_limit > 0 and len(docs) > doc_limit:
      print("Passed doc limit %i" % len(docs))
      break
    print(story.story_title, len(sections))

    for jj in xrange(len(sections)):
      docs["%s-%i" % (story.story_title, jj)] = [x for x in tokenizer.tokenize(sections[jj]) \
                                  if (not x in stop) and \
                                  (min(y in ascii_lowercase for y in x))]
  return docs
Example #5
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
    from nltk.tokenize.treebank import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()

    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from string import ascii_lowercase

    docs = {}
    print("Found %i stories" % stories.count())
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        # text = story.story_title
        text = ''.join(
            BeautifulSoup(text, features="lxml").findAll(text=True)).lower()
        if delimiter:
            sections = text.split(delimiter)
        else:
            sections = [text]

        if doc_limit > 0 and len(docs) > doc_limit:
            print("Passed doc limit %i" % len(docs))
            break
        print(story.story_title, len(sections))

        for jj in xrange(len(sections)):
            docs["%s-%i" % (story.story_title, jj)] = [x for x in tokenizer.tokenize(sections[jj]) \
                                        if (not x in stop) and \
                                        (min(y in ascii_lowercase for y in x))]
    return docs
Example #6
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.
    Use NLTK's TreebankWordTokenizer.
    Note that we first split into sentences using NLTK's sent_tokenize.
    We additionally call a filtering function to remove unwanted tokens.
    
    IN:
    - text, str
    OUT:
    - list of strings
    '''
    ## list of tokens
    list_tokens = []

    ## split text into sentences
    sentences = sent_tokenize(text, language=language)

    ## define the tokenizer
    tokenizer = TreebankWordTokenizer()
    ## loop over all sentences
    for sent in sentences:
        ## tokenize the sentence
        sent_tokenized = tokenizer.tokenize(sent)
        ## add tokens to list of tokens
        list_tokens += sent_tokenized
    list_tokens = filter_tokens(list_tokens)
    return list_tokens
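For reference, a self-contained sketch of the same sentence-then-word pattern, leaving out the project-specific filter_tokens step (assumes NLTK and its punkt sentence model are available):

from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

text = "NLTK ships several tokenizers. The Treebank tokenizer splits contractions like don't."
tokenizer = TreebankWordTokenizer()
tokens = []
for sent in sent_tokenize(text):
    # tokenize sentence by sentence, then concatenate the token lists
    tokens += tokenizer.tokenize(sent)
print(tokens)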
Example #7
class Dictionary:
    def __init__(self, tokenizer_method: str = "TreebankWordTokenizer"):
        self.token2idx = {}
        self.tokenizer = None

        if tokenizer_method == "TreebankWordTokenizer":
            self.tokenizer = TreebankWordTokenizer()
        else:
            raise NotImplementedError(
                "tokenizer_method {} doesn't exist".format(tokenizer_method))

        self.add_token(UNK_TOKEN)  # Add UNK token

    def build_dictionary_from_captions(self, captions: List[str]):
        for caption in captions:
            tokens = self.tokenizer.tokenize(caption)
            for token in tokens:
                self.add_token(token)

    def size(self) -> int:
        return len(self.token2idx)

    def add_token(self, token: str):
        if token not in self.token2idx:
            self.token2idx[token] = len(self.token2idx)

    def lookup_token(self, token: str) -> int:
        if token in self.token2idx:
            return self.token2idx[token]
        return self.token2idx[UNK_TOKEN]
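A minimal usage sketch of the Dictionary class (assuming the class above is in scope and that UNK_TOKEN, whose definition is not shown, is a sentinel string such as '<unk>'):

UNK_TOKEN = "<unk>"  # assumption: the sentinel constant used by Dictionary above

d = Dictionary()
d.build_dictionary_from_captions(["a dog runs", "a cat sleeps"])
print(d.size())                 # number of distinct tokens plus the UNK entry
print(d.lookup_token("dog"))    # index of a known token
print(d.lookup_token("zebra"))  # unseen tokens fall back to the UNK index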
Example #8
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    tokenizer = TreebankWordTokenizer()
    with open(path , encoding = 'utf8') as f:
        for l in f:
            table = str.maketrans(dict.fromkeys(string.punctuation + '0123456789')) #to remove numbers & punctuation
            sentences.append( tokenizer.tokenize(l.translate(table).lower()) )
    return sentences
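The translation table used above can be sanity-checked on a single string; a small self-contained sketch:

import string
from nltk.tokenize.treebank import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
# map every punctuation mark and digit to None, i.e. delete them
table = str.maketrans(dict.fromkeys(string.punctuation + '0123456789'))
line = "In 2021, prices rose by 3.5% -- unexpectedly!"
print(tokenizer.tokenize(line.translate(table).lower()))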
Example #9
class TextPreprocessor:
    _WRONG_CHAR_FILTER = re.compile(
        '[' + ''.join([chr(i) for i in range(0, 0x0a)]) +
        ''.join([chr(i) for i in range(0x0b, 0x20)]) +
        ''.join([chr(i) for i in range(0x80, 0x9f)]) + ']')
    _PARAGRAPH_FILTER = re.compile('\n\n')

    def __init__(self,
                 sentence_tokenizer: Any = None,
                 paragraph_threshold: int = 150):
        """
        Constructor
        :param sentence_tokenizer: a sentence tokenizer that provides a tokenize(t:str)->[str] method
        (for instance: nltk.data.load('tokenizers/punkt/english.pickle'))
        :param paragraph_threshold: the minimum number of characters a paragraph should contain (it will be
        filtered out otherwise)
        """
        self.sentences_tokenizer = sentence_tokenizer
        self.paragraph_threshold = paragraph_threshold
        self._word_tokenizer = TreebankWordTokenizer()

    def clean_text(self, text: str) -> str:
        return self._WRONG_CHAR_FILTER.sub(" ", text)

    def split_to_paragraphs(self, text: str) -> List[str]:
        return re.split(self._PARAGRAPH_FILTER, text)

    def filter_paragraphs(self, paragraphs: List[str]) -> List[str]:
        return list(
            filter(lambda x: len(x) > self.paragraph_threshold, paragraphs))

    '''
    def split_to_sentences(self, text: str) -> Iterable[str]:
        if self.sentences_tokenizer is None:
            raise AttributeError("No tokenizer has been set")

        return self.sentences_tokenizer.tokenize(text)
    '''

    def process_to_paragraphs(self, text: str) -> List[str]:
        r = text

        r = self.clean_text(r)
        r = self.split_to_paragraphs(r)
        r = self.filter_paragraphs(r)

        return r

    @property
    def word_tokenizer(self):
        return self._word_tokenizer

    def tokenize(self, text: str) -> List[str]:
        return self._word_tokenizer.tokenize(text)

    def count_words(self, text: str) -> int:
        return len(self.tokenize(text))
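A short usage sketch of the TextPreprocessor above (assuming the class and its imports are in scope); no sentence tokenizer is needed for the paragraph-level methods:

pre = TextPreprocessor(paragraph_threshold=20)
raw = "First paragraph, long enough to keep.\n\nshort\n\nAnother paragraph that also survives the filter."
for paragraph in pre.process_to_paragraphs(raw):
    # paragraphs shorter than the threshold (here 20 characters) have been dropped
    print(pre.count_words(paragraph), paragraph)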
def treebank_tokenizer(sentence):
    # the tokenizer splits 's but would also split <>, so escape them for now (to be revisited in further work)
    t = TreebankWordTokenizer()
    word_lst = t.tokenize(sentence.lower().replace("<", "LAB_").replace(
        ">", "_RAB"))
    ret = []
    for w in word_lst:
        ret.append(w.replace("LAB_", "<").replace("_RAB", ">"))
    return ret
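A quick sketch of why the LAB_/_RAB round-trip above matters: the Treebank tokenizer would typically split the angle brackets off as separate tokens (assumes the function above is in scope):

print(treebank_tokenizer("The model emits <pad> tokens."))
# the placeholder survives as a single token, e.g. ['the', 'model', 'emits', '<pad>', 'tokens', '.']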
def read(fn, test_percentage, maxlen, max_features, dataset_type):
    """
    :param fn: dataset filename.
    :param test_percentage: fraction of instances held out as the test split.
    :param maxlen: maximum length for each sentence.
    :param max_features: max_features (e.g., unique words, vocabulary)
    :param dataset_type: 'regression' for real-valued labels, otherwise binary classification.
    :return:
    """

    tokenizer = TreebankWordTokenizer()
    c = count(2)
    word_idx = {}
    try:
        lines = codecs.open(fn, encoding='utf8').read().splitlines()
    except UnicodeDecodeError:
        lines = codecs.open(fn).read().splitlines()
    y = []
    X = []
    for line in lines:
        try:
            label, sentence = line.split('\t')
        except ValueError:
            continue
        y.append(label)
        s = []
        for token in tokenizer.tokenize(sentence):
            idx = word_idx.get(token, None)
            if idx is None:
                idx = c.next()
                if idx < max_features:
                    word_idx[token] = idx
                else:
                    idx = 1
            s.append(idx)
        X.append(s)

    X = sequence.pad_sequences(X, maxlen=maxlen)
    num_instance_for_train = int(len(X) * (1 - test_percentage))

    # convert labels into floats if the labels are real-valued.
    if dataset_type == 'regression':
        y = map(lambda e: float(e), y)
    else:
        set_y = set(y)
        print >> sys.stderr, set_y
        label1, label2 = set_y  # now supporting only binary classification.
        labels = {label1: 0, label2: 1}
        y = map(lambda e: labels[e], y)  # map labels 0/1.

    y = np.array(y)

    print "training set size {}, test set size {}".format(num_instance_for_train,
                                                          max(len(X) - num_instance_for_train, 0))

    return (X[:num_instance_for_train, :], y[:num_instance_for_train]), (X[num_instance_for_train:, :],
                                                                         y[num_instance_for_train:]), word_idx
Example #12
class Tagger(object):
    def __init__(self, settings):
        self.database = settings.get('DATABASE')
        nltk.download('punkt')
        self.sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
        self.word_tokenizer = TreebankWordTokenizer()
        
    def make_tagging(self):
        engine = create_engine(URL(**self.database), poolclass=NullPool)
        session = sessionmaker(bind=engine)()
        
        DeclarativeBase.metadata.create_all(engine, checkfirst=True)
        
        logger.warn('clearing tokens table')
        session.query(Token).delete() # remove old tokens # .filter(Token.project_id == project.id)
        projects = session.query(Project.id, Project.text).all()
        tokens = []
        
        BATCH_SIZE = 2000
        batch_row = 1
        for project in projects:
            if project.text is None:
                logger.warn('project #{} has empty description, unable to tokenize'.format(project.id))
                continue
            old_tagged = len(tokens)
            sents_coords = self.sent_tokenizer.span_tokenize(project.text)
            for i in range(len(sents_coords) - 1):
                tokens.append(Token(project_id=project.id, start=sents_coords[i][1], end=sents_coords[i+1][0], pos='<EOS>'))
            if len(sents_coords) > 0:
                tokens.append(Token(project_id=project.id, start=sents_coords[-1][1], end=len(project.text), pos='<EOS>'))
            
            for sent_coords in sents_coords:
                sent = project.text[sent_coords[0]:sent_coords[1]]
                # TreebankWordTokenizer doesn't support span_tokenize yet.
                # The work is under way, but for now we have to implement it ourselves
                words = self.word_tokenizer.tokenize(sent)
                i = 0
                for word, pos in nltk.pos_tag(words):
                    i = sent.find(word, i)
                    if i >= 0:
                        tokens.append(Token(project_id=project.id, pos=pos,
                            start=(sent_coords[0] + i), end=(sent_coords[0] + i + len(word))))
            
            logger.warn('tagged {} tokens for project #{}'.format(len(tokens) - old_tagged, project.id))
            if batch_row % BATCH_SIZE == 0:
                logger.warn('committing batch')
                session.add_all(tokens)
                session.commit()
                tokens = []
            batch_row += 1

        
        logger.warn('committing the last batch')
        session.add_all(tokens)
        session.commit()
        session.close()
def english_tokenization(term):
  word_tokenizer = TreebankWordTokenizer()
  tokenized_term = ""

  for word in word_tokenizer.tokenize(term):
    if tokenized_term != "":
      tokenized_term += " "
    tokenized_term += word

  return tokenized_term
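For reference, this simply rebuilds the term as a space-joined string of Treebank tokens; a one-line usage sketch (assuming the function above is in scope):

print(english_tokenization("Don't over-think it."))  # roughly: "Do n't over-think it ."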
def transform_texts(art,
                    period,
                    site,
                    ngrams=1,
                    mod=None,
                    text_column='text',
                    text_token_column='text_token',
                    remain_columns=('author', 'site', 'link')):
    """Transform dataframe with texts, create tokenized lists in columns.
    Save dataframe to mod directory, if mod is not None."""
    text_column_paragraphs = text_column + '_paragraphs'
    text_token_column_lower = text_token_column + '_lower'
    text_token_column_stemmed = text_token_column + '_stemmed'
    text_token_column_count = text_token_column + '_count'

    st = SnowballStemmer('english')
    art.dropna(subset=[text_column],
               inplace=True)  # maketrans fails if there are nans
    art_sh = art[list((text_column, ) +
                      remain_columns)].copy()  # we don't need more columns
    del art
    gc.collect()

    additional_punctuation = string.punctuation + '«»…—’‘“”–•'  # a few additional, non-ascii chars
    # gigaom
    tt = TreebankWordTokenizer()
    art_sh[text_column] = art_sh[text_column].apply(
        lambda x: x.replace('Tweet\nShare\nPost\n', '').replace(
            '“', '').replace('”', '').replace('’', '\''))
    # sent_tokenize tokenizes by paragraphs
    art_sh[text_column_paragraphs] = art_sh[text_column].apply(
        lambda x: x.split('\n\n'))
    art_sh[text_token_column] = art_sh[text_column_paragraphs].apply(
        lambda x:
        [flatten([tt.tokenize(z) for z in sent_tokenize(y)]) for y in x])
    # to lower, stem
    art_sh[text_token_column_lower] = art_sh[text_token_column].apply(
        lambda x: [[word.lower() for word in paragraph] for paragraph in x])
    art_sh[text_token_column_stemmed] = art_sh[text_token_column_lower].apply(
        lambda x: [[st.stem(word) for word in paragraph] for paragraph in x])
    if ngrams == 2:  # convert to bigrams
        art_sh[text_token_column] = art_sh[text_token_column_lower].apply(
            to_bigram)
        art_sh[text_token_column_lower] = art_sh[
            text_token_column_lower].apply(to_bigram)
        art_sh[text_token_column_stemmed] = art_sh[
            text_token_column_stemmed].apply(to_bigram)

    art_sh[text_token_column_count] = art_sh[text_token_column_stemmed].apply(
        lambda x: dict(Counter(FreqDist(flatten(x)))))

    if mod is not None:
        art_sh.to_csv(mod + 'dfs_articles' + period + site + '.csv')

    return art_sh
Example #15
def tokenize(sents):
    """Identifica los tokens del las oraciones de entrada
    
    Returns:
        Una lista de oraciones. Cada oración es una lista de tokens
    """
    tokenizer = TreebankWordTokenizer()

    sent_tokens = [tokenizer.tokenize(sent) for sent in sents]

    return sent_tokens
Example #16
class DocumentTokenizer(object):
    """
    Used to split a document into sentences and tokens.
    Returns a list of lists TODO
    """
    def __init__(self, sent_tokenizer=None, word_tokenizer=None):
        if not sent_tokenizer:
            #self.sent_tokenizer = ClinicalRushSentenceTokenizer('rush_rules.tsv')
            self.sent_tokenizer = DefaultSentenceTokenizer()
        if not word_tokenizer:
            self.word_tokenizer = TreebankWordTokenizer()

        #self.rush = rush
        #self.word_tokenizer = word_tokenizer

    def tokenize_doc(self, doc):
        """
        Takes raw string. Returns a list of lists where each list is the
        sentence, and each sentence contains two-tuples of tokens and spans.
        """
        tokenized_sents_and_spans = []
        try:
            # sentence_span is a list of tuples of spans
            sentence_spans = self.sent_tokenizer.tokenize_sents(doc)
        except Exception as e:
            raise e
        for start, end in sentence_spans:
            sentence = doc[start:end]
            tokenized_sents_and_spans.append(
                self.tokenize_sent(sentence, start))
        return tokenized_sents_and_spans

    def tokenize_sent(self, sentence, offset):
        try:
            tokens = self.word_tokenizer.tokenize(sentence)
        except Exception as e:
            print("Word tokenizing failed")
            print(sentence)
            raise e
        try:
            spans = self.word_tokenizer.span_tokenize(sentence)
        except Exception as e:
            print("Span tokenizing failed")
            print(sentence)
            raise e
        tokens_and_spans = []
        for token, span in zip(tokens, spans):
            start, end = span
            true_start = start + offset
            true_end = end + offset
            tokens_and_spans.append((token, (true_start, true_end)))
        return tokens_and_spans
Example #17
def tokenize(review: str) -> list:
    """Tokenize string based on NLTK TreebankWordTokenizer.

    Args:
        review: The raw review content.

    Returns:
        A list of tokens found by the NLTK tokenizer.
    """
    tokenizer = TreebankWordTokenizer()
    return tokenizer.tokenize(review)
class TransformationRecommender(object):
    '''
    This class recommends a transformation based on the model information
    (a ModelInfo object) and the issued query
    '''
    
    '''
    This object is a singleton: it holds no private data, only functions.
    The code below implements the singleton pattern
    '''
    _instance = None
    
    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(TransformationRecommender, cls).__new__(
                                cls, *args, **kwargs)
        return cls._instance
    
    def __init__(self):
        self.tf = TextFilter()
        self.wordTokenizer = TreebankWordTokenizer()

    def getRecommendedTransformation(self, modelInfo, query):
        '''
        If the input sentence is the same as the title, except for the title
        part that specifies the object, then "object change" shall be suggested
        '''
        title = modelInfo.getName()
        titleFiltered = self.tf.filter_all_except_stem(title)
        titleToks = self.wordTokenizer.tokenize(titleFiltered)
        
        titleToksNoObj = [t for t in titleToks if t not in modelInfo.getObjects()]
        
        queryFiltered = self.tf.filter_all_except_stem(query)
        sentenceToks = self.wordTokenizer.tokenize(queryFiltered)
        
        if set(titleToksNoObj).issubset(sentenceToks):
            return OBJECT_CHANGE
        else:
            return ''
Example #19
def getSubSentenceList(sentence1, sentence2, set1, set2):
    # obtain the diff words
    (set1, set2) = wordDiffSet(sentence1, sentence2)

    # generate sub sentences
    subsentL1 = []
    subsentL2 = []

    removeIdx1 = []
    removeIdx2 = []

    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()

    sentence1L = tokenizer.tokenize(sentence1)
    sentence2L = tokenizer.tokenize(sentence2)

    for idx, word in enumerate(sentence1L):
        if word in set1:
            removeIdx1.append(idx)

    for idx, word in enumerate(sentence2L):
        if word in set2:
            removeIdx2.append(idx)

    for idx in removeIdx1:
        tokens = tokenizer.tokenize(sentence1)
        tokens.pop(idx)
        subsent = detokenizer.detokenize(tokens)
        subsentL1.append(subsent)

    for idx in removeIdx2:
        tokens = tokenizer.tokenize(sentence2)
        tokens.pop(idx)
        subsent = detokenizer.detokenize(tokens)
        subsentL2.append(subsent)

    return (subsentL1, subsentL2)
Example #21
class TreebankSpanTokenizer(TreebankWordTokenizer):

    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix+len(word_token)
            yield (ix, end)
            ix = end

    def tokenize(self, text):
        return self._word_tokenizer.tokenize(text)
Example #22
def word_tokenize_by_string(note):
    translator = str.maketrans('', '', string.punctuation)
    _treebank_word_tokenizer = TreebankWordTokenizer()
    note = note.translate(translator)
    note = note.replace('0','#')
    note = note.replace('1','#')
    note = note.replace('2','#')
    note = note.replace('3','#')
    note = note.replace('4','#')
    note = note.replace('5','#')
    note = note.replace('6','#')
    note = note.replace('7','#')
    note = note.replace('8','#')
    note = note.replace('9','#')
    tokenized_note = _treebank_word_tokenizer.tokenize(note)
    return tokenized_note
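A small usage sketch of word_tokenize_by_string (assuming the function above and its `import string` are in scope); punctuation is stripped and every digit is masked with '#':

print(word_tokenize_by_string("Patient is 45 years old, BP 120/80."))
# roughly: ['Patient', 'is', '##', 'years', 'old', 'BP', '#####']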
Example #23
    def tokenize_keywords(cls, texts, max_size=3, flexible_window=False):
        """Extract candidate keywords from one or more text sources.

        Args:
            texts (iterable): An iterable of strings. For example, a list of
                sentences: ['first sentence', 'second sentence', ...]
            max_size (int): The maximum number of words that each keyword will
                be made of.
            flexible_window (bool): if True, gather all ngrams whose length goes
                from 1 up to the maximum size provided. If False, only return
                the longer ngram available and discard shorter ones.

        Yields:
            A generator of keywords. Each keyword is a list of up to
            max_size word tokens.

        Examples:
            >>> from kwe.tokenizers import KeywordTokenizer
            >>> sents = [
            ...     'Wolves are an endangered species',
            ...     'Food is any substance consumed to provide nutritional support for the body.'
            ... ]
            >>> list(KeywordTokenizer.tokenize_keywords(sents, 3, True)) # doctest: +NORMALIZE_WHITESPACE
            [['Wolves'], ['endangered'], ['species'], ['endangered', 'species'],
            ['Food'], ['substance'], ['consumed'], ['substance', 'consumed'],
            ['provide'], ['nutritional'], ['support'], ['provide', 'nutritional'],
            ['nutritional', 'support'], ['provide', 'nutritional', 'support'],
            ['body']]
            >>> list(KeywordTokenizer.tokenize_keywords(sents, 3, False)) # doctest: +NORMALIZE_WHITESPACE
            [['Wolves'], ['endangered', 'species'], ['Food'], ['substance',
            'consumed'], ['provide', 'nutritional', 'support'], ['body']]

        """
        tokenizer = TreebankWordTokenizer()

        for text in texts:
            # Preprocessing steps
            word_tokens = tokenizer.tokenize(text)

            word_tokens = cls.remove_punctuation(word_tokens)
            chunks_without_stopwords = list(
                cls._split_at_stopwords(word_tokens))

            yield from cls.extract_ngrams(tokens=chunks_without_stopwords,
                                          size=max_size,
                                          flexible_window=flexible_window)
Example #24
def prepare(sentence):
    
    MAX_SEQ = 20
    tokenizer = TreebankWordTokenizer()
    sent = tokenizer.tokenize(sentence)
    n = MAX_SEQ - len(sent)
    if n < 0:
        sent = sent[:MAX_SEQ]
    else:
        for j in range(n):
            sent.append('<PAD>')
    for j in range(len(sent)):
        if sent[j] in embeddings_index:
            sent[j] = embeddings_index[sent[j]]
        else:
            sent[j] = embeddings_index["<UNK>"]
    return np.array(sent).reshape((1, 20, 300))
Example #25
def treebank_tokenizer(sentence, max_length=0):
    """
    Tokenize and truncate sentence
    :param sentence: str, a sentence string
    :param max_length: int, max token included in the result, 0 for unlimited
    :return: list, a list of token
    """
    # the tokenizer splits 's and would also split $ (and <>), so escape them for now (to be revisited in further work)
    t = TreebankWordTokenizer()
    word_lst = t.tokenize(sentence.lower().replace("$", "_B_"))
    # word_lst = t.tokenize(sentence.lower().replace("<", "LAB_").replace(">", "_RAB"))
    ret = []
    for w in word_lst:
        ret.append(w.replace("_B_", "$"))
        # ret.append(w.replace("LAB_", "<").replace("_RAB", ">"))
    if max_length > 0:
        return ret[:max_length]
    else:
        return ret
Example #26
def word_tokenize(documents):
    cleaned_doc = []
    translator = str.maketrans('', '', string.punctuation)
    _treebank_word_tokenizer = TreebankWordTokenizer()
    for note in documents:
        note = note.translate(translator)
        note = note.replace('0','#')
        note = note.replace('1','#')
        note = note.replace('2','#')
        note = note.replace('3','#')
        note = note.replace('4','#')
        note = note.replace('5','#')
        note = note.replace('6','#')
        note = note.replace('7','#')
        note = note.replace('8','#')
        note = note.replace('9','#')
        tokenized_note = _treebank_word_tokenizer.tokenize(note)
        cleaned_doc.append(tokenized_note)
    return cleaned_doc
Example #27
class CustomizedTreebankWordTokenizer():
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()
        self.tokenizer.PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\<\>]|[\{\}]+'), r' \g<0> ')

        # See discussion on https://github.com/nltk/nltk/pull/1437
        # Adding to TreebankWordTokenizer, the splits on
        # - chevron quotes u'\xab' and u'\xbb' .
        # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
        
        improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
        improved_close_quote_regex = re.compile(u'([»”’])', re.U)
        improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
        self.tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
        self.tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
        self.tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))


    def tokenize(self,text):
        return [x for y in sent_tokenize(text) for x in self.tokenizer.tokenize(y)]
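A brief usage sketch of the customized tokenizer (assuming the class above is in scope along with the imports it needs, i.e. `re` and `sent_tokenize`); the added rules are meant to split off chevron and curly quotes that the stock TreebankWordTokenizer leaves attached:

tok = CustomizedTreebankWordTokenizer()
print(tok.tokenize('He said: «hello there». She replied: “fine”.'))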
Example #28
def return_entity(sent , entity):
    global stop_words
    sent = sent.lower()
    tokenizer = TreebankWordTokenizer()
    sent = tokenizer.tokenize(sent)
    ma = 0
    ans = ""
    pos_tagged = nltk.pos_tag(sent)
    j = 0
    for i in sent:
        if i in embeddings_index:
            if i not in stop_words and pos_tagged[j][1][0:2] != "VB" and sim(embeddings_index[i] , entity) > ma:
                ma = sim(embeddings_index[i] , entity)
                ans = i
        else : print("not in embedding {}".format(i))
        j+=1
                

    if ma < .25 :
        return "nothing"
    return ans
    def tokenization(self, text, args):
        """
        Parameters
        ----------
        text : list
            A list of strings where each string is a single sentence
        args
            Parsed options; if args.restrict_tokenization is set, only alphanumeric tokens are kept
        Returns
        -------
        list
            A list of lists where each sub-list is a sequence of tokens
        """

        t = TreebankWordTokenizer()
        tokenizedText = []
        for sentence in text:
            tokenized_sentence = t.tokenize(sentence)
            if args.restrict_tokenization:
                tokenized_sentence = [
                    word for word in tokenized_sentence if word.isalnum()
                ]
            tokenizedText.append(tokenized_sentence)
        return tokenizedText
Example #30
    def pennTreeBank(self, text):
        """
		Tokenization using the Penn Tree Bank Tokenizer

		Parameters
		----------
		text : list
			A list of strings where each string is a single sentence

		Returns
		-------
		list
			A list of lists where each sub-list is a sequence of tokens
		"""
        penn = TreebankWordTokenizer()
        tokenizedText = []
        for sentence in text:
            tokenizedText.append(penn.tokenize(sentence))


        return tokenizedText
def testset_read(fn, word_idx, maxlen):
    total_num_of_unk = 0
    tokenizer = TreebankWordTokenizer()
    try:
        lines = codecs.open(fn, encoding='utf8').read().splitlines()
    except UnicodeDecodeError:
        lines = codecs.open(fn).read().splitlines()
    X = []
    sentences = []
    for line in lines:
        s = []
        for token in tokenizer.tokenize(line):
            idx = word_idx.get(token, 1)  # 1 is UNKNOWN word id
            if idx == 1:
                total_num_of_unk += 1
            s.append(idx)
        X.append(s)
        sentences.append(line)

    X = sequence.pad_sequences(X, maxlen=maxlen)

    print >> sys.stderr, "Total number of UNK={}, Avg. {}".format(total_num_of_unk, total_num_of_unk / float(len(sentences)))
    return X, sentences
class NLPProcessing(object):
    '''Perform the NLP processing
    
    
    Attributes:
        __tokenizer: Private attribute that holds the tokenizer.
        
        __r_end_sentence: Regex with the possible endings of a sentence.
    '''
    def __init__(self):
        '''
        Constructor
        '''
        self.__tokenizer = TreebankWordTokenizer()

        self.__r_end_sentence = re.compile(r"\.|\?|!")

    def tokenize_sentences_and_tokens(self, a_text):
        """Tokenize the input text
        
        Args:
            a_text: A string to be tokenized
            
        Returns:
            The result of the tokenization process, in other words, a list of 
            lists of tokens. Each list corresponds to a sentence.
        """
        tokens = [[]]
        if (len(a_text) > 0):
            if (self.__r_end_sentence.fullmatch(a_text[-1]) == None):
                a_text += '.'

            tokens = [
                self.__tokenizer.tokenize(s) for s in sent_tokenize(a_text)
            ]

        return (tokens)
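A short usage sketch (assuming the class above is in scope together with `re` and `sent_tokenize`); note that a final period is appended when the text does not already end a sentence:

nlp = NLPProcessing()
print(nlp.tokenize_sentences_and_tokens("First sentence. Second one has no final period"))
# roughly: [['First', 'sentence', '.'], ['Second', 'one', 'has', 'no', 'final', 'period', '.']]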
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated) 
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """

    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""),
                                           string.punctuation)

        ### project part 2: comment out the line below
        # words = text_string
        stemmer = SnowballStemmer('english')
        t = TreebankWordTokenizer()
        text_words = t.tokenize(text_string)
        words = ' '.join([stemmer.stem(w) for w in text_words])

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)

    return words
Example #34
def consistency(s):
    """
    >>> m = consistency("Batch gradient descent algorithms "
    ...                 "... in Batch Gradient Descent ...")
    >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2]
    [('gradientdescent', ['Gradient Descent', 'gradient descent'])]

    >>> m = consistency("This sentence's first word appears uncapitalized in "
    ...                 "this sentence.  Hadoop should be capitalized as "
    ...                 " Hadoop, not hadoop.")
    >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2]
    [('hadoop', ['Hadoop', 'hadoop'])]

    If the second word of a sentence is capitalized, it will be considered
    if and only if the following word is uncapitalized:

    >>> m = consistency("The Operator may be replaced by another operator")
    >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2]
    [('operator', ['Operator', 'operator'])]
    >>> m = consistency("The Operator Descriptor describes an operator")
    >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2]
    []

    """
    sent_tokenizer = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    mappings = defaultdict(set)

    sentences = sent_tokenizer.tokenize(s)
    for sent in sentences:
        tokens = tokenizer.tokenize(sent)
        # The capitalization of individual words poses a problem: we would like
        # to detect cases where names are miscapitalized (e.g. hadoop instead
        # of Hadoop), but we want to avoid false-positives due to capitalized
        # words that start a sentence or are part of capitalized phrases.

        # Therefore, we only add mappings for capitalized unigrams if they do
        # not start a sentence and are not adjacent to other capitalized words.
        for i in range(1, len(tokens)):
            prev_token = tokens[i-1]
            token = tokens[i]
            if i+1 < len(tokens):
                next_token = tokens[i+1]
            else:
                next_token = ""
            adjacent_uppercase = (i > 1 and is_uppercase(prev_token)) or \
                                 is_uppercase(next_token)
            if is_uppercase(token) and adjacent_uppercase:
                continue
            norm = canonicalize(token)
            source = token.strip(",. ")
            mappings[norm].add(source)
        # Map normalized ngrams
        for x in range(2, MAX_PHRASE_LENGTH+1):
            for ngram in ngrams(tokens, x):
                norm = canonicalize(ngram)
                source = " ".join(ngram).strip(",. ")
                if len(source.split()) == x:
                    mappings[norm].add(source)

    # For normalized forms with multiple values, filter out longer ngrams that
    # may be covered by shorter ones or that are trivial capitalization
    # differences
    for (key, values) in list(mappings.items()):  # copy so entries can be deleted while iterating
        if len(values) > 1:
            for (a, b) in bigrams(values):
                (x, y) = [" ".join(x) for x in strip_common_fixes(a.split(),
                                                                  b.split())]
                if (x, y) != (a, b):
                    del mappings[key]
                    break
        else:
            del mappings[key]
    return mappings
Example #35
class QueryManager(object):
    '''
    Given a specification query, this object returns a set of models
    together with possible transformations that can be applied to each model
    to satisfy the specification query
    '''


    def __init__(self, modelIndexManager):
        '''
        @param modelIndex: reference to the place where the models are indexed
        '''
        self.textFilter = TextFilter()
        self.modelIndexManager = modelIndexManager
        self.wordTokenizer = TreebankWordTokenizer()
        self.tRecommender = TransformationRecommender() 
        
    def __parseQuery(self, queryString):
        '''
        This function returns the words included in queryString,
        after filtering all the stopwords, performing stemming
        and applying all the filters provided by textFilter
        @param queryString: the specification query in the form of a string 
        ''' 
        filteredQueryString = self.textFilter.filter_all(queryString)
        return self.wordTokenizer.tokenize(filteredQueryString)
        
    def issueQuery(self, queryString):
        '''
        This is the main function of this class. Given the specification
        query, the function parses the specification and returns a
        set of QueryResult objects, which include the link to the models
        @param queryString: the specification query in the form of a string
        @return: a list of QueryResult objects.
        '''
        qr = list()
        
        stems = self.__parseQuery(queryString)
        for stem in stems:
            
            modelsInfos = self.modelIndexManager.searchModels(stem, STEM_STRING)
            
            #modelsTransformationsList = [(model, "object change") for model in models]
            #results[stem] = modelsTransformationsList
        
            if not modelsInfos == None:
                for modelInfo in modelsInfos:
                    score = 0.1
                    transformation = self.tRecommender.getRecommendedTransformation(modelInfo, queryString)
                    qr.append(QueryResult(modelInfo, [transformation], score))
                    
            qr.sort(key=lambda x: x.score) #the list is ordered by the score attribute and reversed
            qr.reverse()
        
        '''
        @todo: for each model we shall understand which is the best transformation.
        To this end, an additional class is required.
        Currently, we always add the object change transformation together 
        with each model found. 
        '''
        
        return qr   
class TextFilter(object):
    '''
    This class filters a text, providing typical IR functions such as stop-word removal,
    stemming and so forth
    '''


    def __init__(self):
        '''
        Constructor
        '''
        self.stopwords = open(stopwords_file, 'r').read().split()
        self.wordTokenizer = TreebankWordTokenizer()
        
    def remove_stopwords(self, string_text):
        """
        The function takes a string as input and returns a string
        without the stopwords
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        filteredtext = ' '.join([t for t in tokens if t.lower() not in self.stopwords])
        return filteredtext

    def __remove_item_from_term(self, term, item):
        """
        remove character @param item from @param term
        """
        return ''.join([c for c in term if c != item])
        

    def remove_item(self, string_text, item):
        """
        remove character @param item from the string
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        filteredtext = ' '.join([self.__remove_item_from_term(t, item) for t in tokens])
        return filteredtext
    
    def stem_words(self, string_text):
        """
        The function takes a string as input and returns a string with stemmed words
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        stemmer = PorterStemmer()
        stemmedtext = ' '.join([stemmer.stem(t) for t in tokens]) 
        return stemmedtext
    
    def remove_punct(self, string_text):
        """
        The function takes a string as input and returns the same string without punctuation
        """ 
        nopunct_text = ''.join([c for c in string_text if re.match("[a-zA-Z\-\' \n\t]", c)]) 
        return nopunct_text

    def lower_all(self, string_text):
        """
        Reduce each term in @param string_text to lowercase
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        lowercase_string = ' '.join([t.lower() for t in tokens])
        return lowercase_string

    def remove_single_char(self, string_text):
        """
        remove single char items from @param string_text
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        no_single_char_string = ' '.join([t for t in tokens if len(t) > 1])
        return no_single_char_string
    
    def filter_all(self, string_text):
        """
        executes all the filter functions on @param string_text 
        @param string_text: input text
        """
        sentence_no_punct = self.remove_punct(string_text)
        sentence_no_single_char = self.remove_single_char(sentence_no_punct)
        sentence_no_stopwords = self.remove_stopwords(sentence_no_single_char)
        filtered_sentence = self.stem_words(sentence_no_stopwords)

        filtered_sentence = self.lower_all(filtered_sentence)
        #filtered_sentence = self.lower_all(sentence_no_single_char)
        
        return filtered_sentence
Example #37
def main():

    text = raw_input('Enter a question...\n')
    print text
    graph_db = neo4j.GraphDatabaseService()
    batch = neo4j.WriteBatch(graph_db)

    TreeBankTokenizer = TreebankWordTokenizer()
    PunktTokenizer = PunktSentenceTokenizer()


    qIdentifiers = {
                    "What": ' ',
                    "Who": 'PERSON',
                    "Where": 'GPE',
                    "When": 'TIME',
                    "Why":'',
                    "How":''
                    }
    entities = []
    tokens = []
    for sentence in PunktTokenizer.tokenize(text):
            chunks = ne_chunk(pos_tag(TreeBankTokenizer.tokenize(sentence)))
            for chunk in chunks:
             if hasattr(chunk,'node'):
                tmp_tree = nltk.Tree(chunk.node, [(''.join(c[0] for c in chunk.leaves()))])
                tokens.append(tmp_tree)
             else:
                tokens.append(chunk[0])
            entities.extend([chunk for chunk in chunks if hasattr(chunk,'node')])

            #print chunks



    print tokens
    #entities dict
    entities_dict = {}
    for entity in entities:
        leaves = entity.leaves()
        if len(leaves) > 1 :
         entities_dict[entity.leaves()[0][0]+entity.leaves()[1][0]] = entity.node
        else :
         entities_dict[entity.leaves()[0][0]] = entity.node

    print entities_dict

    class doc():pass
    doc.headline=['']
    doc.text = tokens



  # Q&A answering algorithm
    #  Find the type of question
    qId = ''
    for key in qIdentifiers.keys():
      if key in str(text):
         print key
         qId = qIdentifiers[key]
    #  Find what kind of answer is required
    answerType = qId
    #  Find start node
    start_node = entities_dict.keys()[0]
    start_node_type = entities_dict[start_node]
    #  Run string similarity between relation text and question text
       # for the time being reading from the file

    #  Build query
    cypherQuery = "START me=node:objects(name='" + start_node + "') MATCH me-[r]->obj  RETURN r,obj.name LIMIT 10 "
    #  Start Graph traversal
    query = neo4j.CypherQuery(graph_db, cypherQuery)
    for record in query.stream():
      print 'printing records'
      print record[0]
      print record[1]
      print '\n'
Example #38
    def question():
        # question asking part
        qText = raw_input('Enter a question...\n')
        graph_db = neo4j.GraphDatabaseService()
        batch = neo4j.WriteBatch(graph_db)

        TreeBankTokenizer = TreebankWordTokenizer()
        PunktTokenizer = PunktSentenceTokenizer()


        qIdentifiers = {
                        "What": ' ',
                        "Who": 'PERSON',
                        "Where": 'GPE',
                        "When": 'TIME',
                        "Why":'',
                        "How":''
                        }
        entities = []
        tokens = []
        for sentence in PunktTokenizer.tokenize(qText):
                chunks = ne_chunk(pos_tag(TreeBankTokenizer.tokenize(sentence)))
                for chunk in chunks:
                 if hasattr(chunk,'node'):
                    tmp_tree = nltk.Tree(chunk.node, [(''.join(c[0] for c in chunk.leaves()))])
                    tokens.append(tmp_tree)
                 else:
                    tokens.append(chunk[0])
                entities.extend([chunk for chunk in chunks if hasattr(chunk,'node')])

                #print chunks



        #print tokens
        #entities dict
        entities_dict = {}
        for entity in entities:
            leaves = entity.leaves()
            if len(leaves) > 1 :
             entities_dict[entity.leaves()[0][0]+' '+entity.leaves()[1][0]] = entity.node
            else :
             entities_dict[entity.leaves()[0][0]] = entity.node

        #print entities_dict


      # Q&A answering algorithm
        #  Find the type of question
        qId = ''
        for key in qIdentifiers.keys():
          if key in str(qText):
             #remove key from text
             qText = qText.split(key)[1]
             print qText
             qId = qIdentifiers[key]
        #  Find what kind of answer is required
        answerType = qId
        # find the relation closest to the question text
        maximum = 0.0
        queryRel = ''
        for rel in relations.keys():
            # do string comparison
            #score =  stringcomp(str(qText),str(relations[int(rel)]))
            score = SequenceMatcher(None,str(qText),str(relations[int(rel)])).ratio()
            if score > maximum :
                maximum = score
                queryRel = "`"+str(rel)+"`"

        #print queryRel
        #  Find start node
        try:
            start_node = entities_dict.keys()[0]
        except Exception, err:
            print 'No entity found in the question'
            question()
Example #39
    f = open(filename,'rU')
    raw = f.read()
    #normalize text
    for p in string.punctuation:
        if p != ',' :
            raw = raw.replace(p, '')
    raw = raw.strip()

    #IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    tokens = []



    entities = []
    for sentence in PunktTokenizer.tokenize(raw):
            chunks = ne_chunk(pos_tag(TreeBankTokenizer.tokenize(sentence)))
            for chunk in chunks:
             if hasattr(chunk,'node'):

                tmp_tree = nltk.Tree(chunk.node, [(''.join(c[0] for c in chunk.leaves()))])
                tokens.append(tmp_tree)
             else:
                tokens.append(chunk[0])

            entities.extend([chunk for chunk in chunks if hasattr(chunk,'node')])

            #print chunks
    #raw_input('Press <enter> to continue')
    #print entities
    #entities dict
    entities_dict = {}
Example #40
from nltk.tokenize.treebank import TreebankWordTokenizer

VERBS = set(["geben", "helfen", "sagen", "machen", "arbeiten", "bringen"])

tokenizer = TreebankWordTokenizer()

if __name__ == "__main__":
    pattern = sys.argv[1]

    sentences = defaultdict(list)

    for ii in glob(pattern):
        for jj in map(strip, open(ii)):
            if any(jj.endswith("%s." % verb) for verb in VERBS):
                sentence = jj.split(".")[-2]
                words = tokenizer.tokenize(sentence)
                sentences[words[-1]].append(words)

    good_verbs = [x for x in sentences if len(sentences[x]) > 20]

    test_file = open("german/test.txt", 'w')

    print("COUNT:")
    for ii in good_verbs:
        print("%s\t%i" % (ii, len(sentences[ii])))

        o = open("german/verb_%s.txt" % ii, 'w')

        for (jj, sent) in enumerate(sentences[ii]):
            if jj % 5 == 0:
                test_file.write("%s\n" % ' '.join(sent))
Example #41
#!/usr/bin/env python

from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk, Tree

import re

_word_tokenizer = TreebankWordTokenizer()
_stemmer = PorterStemmer()
_lemmatizer = WordNetLemmatizer()


text = "At 12:35 a.m. ET (1735 GMT) the Dow Jones industrial average .DJI was up 211.89 points, or 1.31 percent, at 16,363.3."

# tokenize sentence
cleaned_sentence = re.sub(r'\W', ' ', text)
tokens = _word_tokenizer.tokenize(cleaned_sentence)

tokens_stemmed = [_stemmer.stem(word.lower()) for word in tokens]

print(tokens_stemmed)
Example #42

def clean_uri(uri):
    if uri.startswith("<") and uri.endswith(">"):
        return clean_uri(uri[1:-1])
    elif uri.startswith("\"") and uri.endswith("\""):
        return clean_uri(uri[1:-1])
    return uri


fname = "../data/SimpleQuestions_v2_modified/all.txt"

with open(fname, 'r') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print("line: {}".format(i))

        items = line.strip().split("\t")
        if len(items) != 5:
            print("ERROR: line - {}".format(line))
            sys.exit(0)

        lineid = items[0]
        subject = www2fb(items[1])
        predicate = www2fb(items[2])
        question = items[4].lower()

        tokenizer = TreebankWordTokenizer()
        # tokenizer = MosesTokenizer()
        tokens = tokenizer.tokenize(question)
        print("{} - {}".format(lineid, tokens))
Example #43
def filtering_via_syntactic_and_semantic_information_replace(pert_sent, synonyms):
    """Filter sentences by synonyms and constituency structure for PaInv-Replace.
    Returns a dictionary of original sentence to list of filtered sentences
    """
    stopWords = list(set(stopwords.words('english')))
    syn_dic = {}
    filtered_sent = {}
    stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()

    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()

    # Run CoreNLPPArser on local host
    eng_parser = CoreNLPParser('http://localhost:9000')

    for original_sentence in list(pert_sent.keys()):
        # Create a dictionary from original sentence to list of filtered sentences
        filtered_sent[original_sentence] = []
        tokens_or = tokenizer.tokenize(original_sentence)
        # Constituency tree of source sentence
        source_tree = [i for i, in eng_parser.raw_parse_sents([original_sentence])]
        # Get lemma of each word of source sentence
        source_lem = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(original_sentence)]
        new_sents = pert_sent[original_sentence]
        target_trees_GT = []
        num = 50
        # Generate constituency tree of each generated sentence
        for x in range(int(len(new_sents)/num)):
            target_trees_GT[(x*num):(x*num)+num] = [i for i, in eng_parser.raw_parse_sents(new_sents[(x*num):(x*num)+num])]
        x = int(len(new_sents)/num)
        target_trees_GT[(x*num):] = [i for i, in eng_parser.raw_parse_sents(new_sents[(x*num):])]
        for x in range(len(new_sents)):
            s = new_sents[x]
            target_lem = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(s)]
            # If sentence is same as original sentence then filter that
            if s.lower()==original_sentence.lower():
                continue
            # If their constituency structure is not the same, then filter
            if treeDistance(target_trees_GT[x],source_tree[0]) > 1:
                continue
            # If original sentence and generate sentence have same lemma, then filter
            if target_lem == source_lem:
                continue
            # Tokens of generated sentence
            tokens_tar = tokenizer.tokenize(s)
            for i in range(len(tokens_or)):
                if tokens_or[i]!=tokens_tar[i]:
                    word1 = tokens_or[i]
                    word2 = tokens_tar[i]
                    word1_stem = stemmer.stem(word1)
                    word2_stem = stemmer.stem(word2)
                    word1_base = WordNetLemmatizer().lemmatize(word1,'v')
                    word2_base = WordNetLemmatizer().lemmatize(word2,'v')
                    # If original word and predicted word have same stem, then filter
                    if word1_stem==word2_stem:
                        continue
                    # If they are synonyms of each other, then filter
                    syn1 = synonyms(word1_base)
                    syn2 = synonyms(word2_base)
                    if (word1 in syn2) or (word1_base in syn2) or (word2 in syn1) or (word2_base in syn1):
                        continue
                    if ((word1 in stopWords) or (word2 in stopWords) or (word1_stem in stopWords)
                        or (word2_stem in stopWords) or (word1_base in stopWords) or (word2_base in stopWords)):
                        continue
                    filtered_sent[original_sentence].append(s)
    return filtered_sent
Example #44
class RequirementsModel(object):
    '''
    This class embeds the information residing in the XML
    of a requirements model passed as input parameter
    during construction 
    '''

    def __init__(self, modelID, inputXMLfilepath= "", modelType="", title="", objects=[]):
        '''
        Constructor
        @param modelID: identifier of the model
        @param inputXMLfilepath: path to the input XML file containing the model;
        if this parameter is left empty a new XML tree is created
        @param type: KAOS, TROPOS, or any other kind of model
        '''
        self.textFilter = TextFilter()
        self.wordTokenizer = TreebankWordTokenizer()        
        self.maxID = "100"  #@todo: we have to set the current maximum to the actual maximum value
                            #for the model
        self.modelInfo = ModelInfo(modelID)
        
        if not inputXMLfilepath == "":
            
            self.modelInfo.setLocation(inputXMLfilepath)
            
            self.tree =  ET.parse(self.modelInfo.getLocation())
        
            self.__loadModelInfo(self.modelInfo)
            self.modelGoals = self.__loadModelGoals()
            self.modelWords = self.__loadModelWords()
            self.modelStems = self.__loadModelStems()
        else:
            attributes = dict()
            attributes['type'] = modelType
            attributes['title'] = title
            attributes['object'] = objects
            root = Element("MODEL", attributes)
            self.tree = ElementTree(root)
    
    def __loadModelInfo(self, modelInfo):
        '''
        This function loads the name of the model from the "title" field of the MODEL tag,
        together with the type and the objects, and stores this information in the 
        ModelInfo object
        '''
        root = self.tree.getroot()
        
        modelInfo.setName(self.textFilter.lower_all(root.get("title")))
        modelInfo.setType(self.textFilter.lower_all(root.get("type")))
        
        objects = root.get("object").strip().split(OBJ_SEPARATOR)
        lowercaseObjects = [self.textFilter.lower_all(o) for o in objects]
        modelInfo.setObjects(lowercaseObjects)   
    
    
    def __loadModelGoals(self):
        '''
        The function loads the goal names included in the model
        and returns a list with all the goals of the model.
        The goal names are stored in lowercase.
        '''  
        root = self.tree.getroot()
        goalNames = list()

        for child in root.iter('ENTITY'):
            if child.attrib['type'] == 'goal': 
                goalNames.append(self.textFilter.lower_all(child.attrib['name'])) 
                    
        return goalNames
        
        
    def __loadModelWords(self):
        '''
        The function loads the words included in the model
        and returns a dictionary with all the words of the model
        and their frequency
        '''
               
        tokenizedWords = dict()

        if self.modelGoals is not None:
            for name in self.modelGoals:
                nameFiltered = self.textFilter.filter_all_except_stem(name)
                words = self.wordTokenizer.tokenize(nameFiltered)
                for word in words:
                    if word not in tokenizedWords:
                        tokenizedWords[word] = 1
                    else:
                        tokenizedWords[word] = tokenizedWords[word] + 1
                    
        return tokenizedWords
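        # Note: a compact (hypothetical) equivalent of the counting loop above,
        # using collections.Counter from the standard library, could look like:
        #     counts = Counter()
        #     for name in self.modelGoals or []:
        #         counts.update(self.wordTokenizer.tokenize(
        #             self.textFilter.filter_all_except_stem(name)))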
        
    def __loadModelStems(self):
        '''
        The function loads the stems included in the model
        and returns a dictionary with all the stems of the model
        and their frequency
        ''' 
        tokenizedStems = dict()
        
        if self.modelWords is not None:
            for w in self.modelWords.keys():
                stem = self.textFilter.filter_all(w)
                if stem not in tokenizedStems:
                    tokenizedStems[stem] = self.modelWords[w]
                else:
                    tokenizedStems[stem] = tokenizedStems[stem] + self.modelWords[w]
                    
        return tokenizedStems
        
    def __getModelStems(self):
        return self.modelStems.keys()
    
    def __getModelWords(self):
        return self.modelWords.keys()
    
    def __getModelGoals(self):
        return self.modelGoals
    
    def __getModelStemsAndFreq(self):
        return self.modelStems
    
    def __getModelWordsAndFreq(self):
        return self.modelWords
    
    def getModelInfo(self):
        return self.modelInfo
    
    def getModelID(self):
        return self.modelInfo.getId()
    
    def getModelKeys(self, keyType):
        if keyType == STEM_STRING:
            return self.__getModelStems()
        if keyType == WORD_STRING:
            return self.__getModelWords()
        if keyType == GOAL_STRING:
            return self.__getModelGoals() 
        
    def getModelKeysAndFrequencies(self, keyType):
        if keyType == STEM_STRING:
            return self.__getModelStemsAndFreq()
        if keyType == WORD_STRING:
            return self.__getModelWordsAndFreq()
        if keyType == GOAL_STRING:
            return dict(zip(self.__getModelGoals(), [1] * len(self.__getModelGoals())))
            
        
    def changeTitle(self, newTitle):
        '''
        This function shall change the title of the model, 
        which means changing the modelInfo and the XML
        of the model
        '''
        #self.modelInfo.setName(newTitle)
        
        root = self.tree.getroot()
        root.set("title", newTitle)
        self.__loadModelInfo(self.modelInfo) #the function updates the modelInfo structure
    
    def changeObjects(self, newObjectsList):
        '''
        This function shall change the objects of the model,
        which means changing the modelInfo 
        but also the XML of the model
        '''
        
        root = self.tree.getroot()
        
        newObjects = ' ,'.join(newObjectsList)
        root.set("object", newObjects)  
        self.__loadModelInfo(self.modelInfo)
        
    def changeGoalName(self, goalID, newGoalName):
        '''
        @param goalID: ID of the goal that shall have a new name
        @param newGoalName: string representing the new name of the goal  
        '''
        root = self.tree.getroot()

        for child in root.iter('ENTITY'):
            if child.attrib['type'] == 'goal' and child.attrib['id'] == goalID:
                child.attrib['name'] = newGoalName 
        
    def searchGoalByName(self, goalName):
        '''
        @param goalName: name of the goal to be searched
        return: goalID, which is the unique ID of the goal, if the goal exists
                -1, if the goal is not found
        '''
        root = self.tree.getroot()

        for child in root.iter('ENTITY'):
            if child.attrib['type'] == 'goal' and child.attrib['name'] == goalName:
                return child.attrib['id']
        
        return -1 
    
    def searchGoalsBySubstring(self, goalSubstring, caseSensitive = "NO"):
        '''
        @param goalSubstring: a substring that shall be searched among the goal names.
        By default the search is not case sensitive
        return: a dictionary mapping the ID of each goal whose name includes @param goalSubstring to that goal's name
        '''
        root = self.tree.getroot()
        goalDict = dict()

        for child in root.iter('ENTITY'):
            if child.attrib['type'] == 'goal': 
                if caseSensitive == "NO":
                    if self.textFilter.lower_all(goalSubstring) in self.textFilter.lower_all(child.attrib['name']):
                        goalDict[child.attrib['id']] = child.attrib['name']
                else:
                    if goalSubstring in child.attrib['name']:
                        goalDict[child.attrib['id']] = child.attrib['name']
                
        
        return goalDict
    
    def __assignUniqueIDs(self, treeRoot):
        '''
        This function assigns unique IDs to all the objects
        of type ENTITY in @param treeRoot
        '''
        currentMaxId = self.maxID
        for child in treeRoot.iter('ENTITY'):
            currentMaxId = str( int(currentMaxId) + 1 )
            child.attrib['id'] = currentMaxId
            
        self.maxID = currentMaxId
    
    def insertTree(self, parentID, childTree):
        '''
        Given @param childTree, which can be a whole tree or a single node, it is added as a child
        of the element identified by @param parentID, below the first refinement of that parent.
        The assumption here is that each parent can have ONLY ONE TYPE of refinement.
        Unique IDs are dynamically assigned to the child elements by the function.
        '''
        root = self.tree.getroot()
        
        for child in root.iter('ENTITY'):
            if child.attrib['id'] == parentID:
                refinement = child.findall("REFINEMENT")
                if refinement and len(refinement) == 1: #ONLY ONE TYPE of refinement is allowed for each element
                    self.__assignUniqueIDs(childTree)
                    refinement[0].append(childTree)
                    return

    def saveModelAs(self, destinationFilePath):
        '''
        @param destinationFilePath: path of the file where the model shall be saved.
        @todo: currently the model is saved to another location and the original location
        is lost. Therefore, the model currently keeps the same ID. We have to change
        this behaviour. 
        '''
        self.modelInfo.setLocation(destinationFilePath) 
        self.saveModel()
        
    def saveModel(self):
        '''
        Save the model in the same destination as the input folder
        and with the original name
        '''
        try:
            self.tree.write(self.modelInfo.getLocation())
        except IOError:
            print "IOError: Saving to a path that does not exist! Use saveModelAs() instead"
        except:
            print "An error occurred"
def main(sysargs):
    sys.argv = sysargs
    arg_parser = argparse.ArgumentParser(description='Formats debates by removing HTML and filtering words.')
    arg_parser.add_argument('-i', '--infile', required=True, help='Debate file to format.')
    args = arg_parser.parse_args()

    # Initialize nltk elements.
    parser = SpeechHTMLParser()
    sent_splitter = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    tagger_loc = '/het/users/jengi/stanford-postagger/'
    tagger = StanfordTagger(tagger_loc + 'models/wsj-0-18-bidirectional-distsim.tagger', \
                                tagger_loc + 'stanford-postagger.jar')
    stemmer = SnowballStemmer('english')

    # Read infile.
    speaker_pattern = re.compile('.*:')
    null_pattern = re.compile('\s*(\[[^\]]*\]|\([^\)]*\))')
    dash_pattern = re.compile('\S+(--)\s+')
    ellipse_pattern = re.compile('\s*\.\.\.\s*')
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', \
                 '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', \
                 '\\', ']', '^', '_', '`', '{', '|', '}', '~']
    block_lengths = []
    with open(args.infile, 'r') as afile:
        file_contents = afile.read()
        parser.feed(file_contents)
        parser.close()

        num_blocks = 0
        speeches = {}
        for (speaker, block) in parser.text:
            if num_blocks % 10 == 0:
                print >> sys.stderr, 'Processing block ' + str(num_blocks) + ' ...'
            orig_block = block

            # Remove applause, laughter, etc.
            block = repeated_search(block, null_pattern, 0)

            # Remove -- from the end of words.  (Indicates stuttering / stopping.)
            block = repeated_search(block, dash_pattern, 1)

            # Do more complex tokenization.
            sents = sent_splitter.tokenize(block)
            sents = [ellipse_pattern.sub(' ... ', sent) for sent in sents]
            tokens = [tokenizer.tokenize(sent) for sent in sents]

            # Run POS tagger and keep only nouns.
            # Also lowercase and stem these nouns.
            tags = [tagger.tag(toks) for toks in tokens]
            tokens = []
            tagged_text = []
            for sent in tags:
                tokens.append([])
                for (word, tag) in sent:
                    tagged_text.append(word)
                    tagged_text.append(tag)
                    if tag in noun_tags:
                        tokens[len(tokens) - 1].append(stemmer.stem(word.lower()))

            # Remove any "sentences" that are actually empty and
            # any tokens that are pure punctuation.
            for i in reversed(range(len(tokens))):
                for j in reversed(range(len(tokens[i]))):
                    non_punct = ''.join([tok for tok in tokens[i][j] if tok not in punct])
                    if len(non_punct) == 0:
                        del tokens[i][j]

                if len(tokens[i]) == 0:
                    del tokens[i]

            # Make sure there is still at least one sentence left.
            num_sents = len(tokens)
            if num_sents == 0:
                continue

            # Add block to speeches dictionary.
            speaker = speaker[:speaker_pattern.match(speaker).end() - 1]
            if speaker not in speeches:
                speeches[speaker] = []
            speeches[speaker].append(orig_block)
            speeches[speaker].append(' '.join(tagged_text))
            speeches[speaker].append('\n'.join([' '.join(sent) for sent in tokens]))
            #print speeches[speaker][0]
            #print speeches[speaker][1]
            #print speeches[speaker][2]

            num_blocks += 1
            num_tokens = 0
            for toks in tokens:
                num_tokens += len(toks)
            block_lengths.append(num_tokens)

    # Save each speaker's text to a file.
    (infolder, basename) = os.path.split(os.path.abspath(args.infile))
    out_prefix = infolder + '/'
    out_suffix = basename
    for speaker in speeches:
        # Create outfile prefixed by speaker's name.
        outfile = open(out_prefix + speaker + '-' + out_suffix, 'w')

        # Save text to outfile.
        blocks = speeches[speaker]
        for i in range(0, len(blocks), 3):
            print >> outfile, blocks[i]
            print >> outfile, blocks[i + 1]
            print >> outfile, blocks[i + 2]
            print >> outfile

        outfile.close()

    print '# of blocks: ' + str(num_blocks)
    print 'Mean # of tokens (per block): ' + str(scipy.mean(block_lengths))
    print 'Median # of tokens: ' + str(scipy.median(block_lengths))
    print 'Standard deviation in # of tokens: ' + str(scipy.std(block_lengths))
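
The main() above also depends on a repeated_search() helper that is not included in this listing. The following is only a guess at its behaviour, based on how it is called: it keeps deleting the matched span (the whole match when the group index is 0, otherwise the numbered capture group) until the pattern no longer matches.

def repeated_search(text, pattern, group):
    # Assumed behaviour: repeatedly remove the matched span from the text
    # until the compiled pattern no longer finds a match.
    match = pattern.search(text)
    while match:
        start, end = match.span(group)
        text = text[:start] + text[end:]
        match = pattern.search(text)
    return text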


from nltk import pos_tag, ne_chunk
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer

TreeBankTokenizer = TreebankWordTokenizer()
PunktTokenizer = PunktSentenceTokenizer()
text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
 of the Eastern Conference. Founded in 1946, the team is currently owned by 
 Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
 which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.
 
 The Celtics have dominated the league during the late 50's and through the mid 80's, 
 with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek, 
 Larry Bird and legendary Celtics coach Red Auerbach, 
 combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''

sentences = PunktTokenizer.tokenize(text)
tokens = [TreeBankTokenizer.tokenize(sentence) for sentence in sentences]
tagged = [pos_tag(token) for token in tokens]
chunked = [ne_chunk(taggedToken) for taggedToken in tagged]

chunked[0].draw()
chunked[-1].draw()
chunked[-3].draw()

print chunked
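
As a follow-up to the drawing calls, a small sketch showing how the named entities could be collected from the chunked trees programmatically; this assumes NLTK 3, where Tree exposes label() and leaves(), and that ne_chunk was run in its default non-binary mode.

# Collect (entity text, entity type) pairs from every chunked sentence.
entities = []
for tree in chunked:
    for subtree in tree.subtrees():
        if subtree.label() != 'S':
            entityText = ' '.join(word for word, tag in subtree.leaves())
            entities.append((entityText, subtree.label()))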