class TokenizingEmbeddingVectorizer:
    """
    This vectorizer first tokenizes its input.
    """

    def __init__(self, embedding_file, ignored_tokens=set()):

        self.vectorizer = EmbeddingVectorizer(embedding_file, ignored_tokens)
        self.tokenizer = TreebankWordTokenizer()
        self.embeddings = self.vectorizer.embeddings
        self.token2Index = self.vectorizer.token2Index

    def tokenize_sentences(self, sentences):
        tokenized_sentences = list(map(lambda sentence: " ".join(self.tokenizer.tokenize(sentence)), sentences))
        return tokenized_sentences

    def prepare_data(self, sentences, labels):
        prepared_sentences = self.tokenize_sentences(sentences)
        return self.vectorizer.prepare_data(prepared_sentences, labels)

    def sentences_to_padded_indices(self, sentences, max_length, padding="pre"):
        return self.vectorizer.sentences_to_padded_indices(sentences, max_length, padding)

    def sentences_to_indices(self, sentences):
        return np.array(self.vectorizer.sentences_to_indices(sentences))
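
A hypothetical usage sketch of the class above; EmbeddingVectorizer and the "glove.txt" path are assumptions standing in for the project's own embedding wrapper and embedding file:

# Assumes EmbeddingVectorizer and an embedding file are available in the project.
vectorizer = TokenizingEmbeddingVectorizer("glove.txt")
print(vectorizer.tokenize_sentences(["I can't wait."]))
# e.g. ["I ca n't wait ."] -- Treebank tokens re-joined with single spaces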
Example #2
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.
    Use NLTK's TreebankWordTokenizer.
    Note that we first split into sentences using NLTK's sent_tokenize.
    We additionally call a filtering function to remove unwanted tokens.
    
    IN:
    - text, str
    OUT:
    - list of strings
    '''
    ## list of tokens
    list_tokens = []

    ## split text into sentences
    sentences = sent_tokenize(text, language=language)

    ## define the tokenizer
    tokenizer = TreebankWordTokenizer()
    ## loop over all sentences
    for sent in sentences:
        ## tokenize the sentence
        sent_tokenized = tokenizer.tokenize(sent)
        ## add tokens to list of tokens
        list_tokens += sent_tokenized
    list_tokens = filter_tokens(list_tokens)
    return list_tokens
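
A minimal usage sketch for tokenize_text, assuming NLTK's punkt data is installed; filter_tokens belongs to the surrounding project, so a pass-through stub is used here:

from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

def filter_tokens(tokens):
    # stand-in for the project's real filtering function
    return tokens

print(tokenize_text("Dr. Smith arrived. He was late."))
# e.g. ['Dr.', 'Smith', 'arrived', '.', 'He', 'was', 'late', '.']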
Example #3
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
    from nltk.tokenize.treebank import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()

    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from string import ascii_lowercase

    docs = {}
    print("Found %i stories" % stories.count())
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        # text = story.story_title
        text = ''.join(
            BeautifulSoup(text, features="lxml").findAll(text=True)).lower()
        if delimiter:
            sections = text.split(delimiter)
        else:
            sections = [text]

        if doc_limit > 0 and len(docs) > doc_limit:
            print("Passed doc limit %i" % len(docs))
            break
        print(story.story_title, len(sections))

        for jj in range(len(sections)):
            docs["%s-%i" % (story.story_title, jj)] = [x for x in tokenizer.tokenize(sections[jj]) \
                                        if (x not in stop) and \
                                        all(y in ascii_lowercase for y in x)]
    return docs
Example #4
 def get_tf_idf_score(self, sentence, mode, ngram=1):
     if ngram not in range(1, 4):
         try:
             raise ValueError
         except ValueError as v:
             print "Only unigrams, bigrams and trigrams are supported."
     if mode != "lex" and mode != "pos":
         try:
             raise ValueError
         except ValueError as v:
             print "Only lexical and POS distinctness supported."
     if len(self.document_freqs_lex.keys()) == 0 or len(
             self.document_freqs_pos.keys()) == 0:
         try:
             raise AttributeError
         except AttributeError as ae:
             print "Document frequency dictionaries not initialized. Call load_doc_freqs() " \
                   "on the LM object."
     tokenizer = TreebankWordTokenizer()
     sentence = sentence.lower()
     tokens = tokenizer.tokenize(sentence)
     tokens = self.__fix_tokens(tokens)
     tags = nltk.pos_tag(tokens)
     tags = self.__add_start_end_tags(tags)
     if mode == "lex":
         score = self.__get_lex_tf_idf(tags, ngram)
         return score
     else:
         score = self.__get_pos_tf_idf(tags, ngram)
         return score
    def __init__(self):
        '''
        Constructor
        '''
        self.__tokenizer = TreebankWordTokenizer()

        self.__r_end_sentence = re.compile(r"\.|\?|!")
def tokenize(documents):
    documents2 = []
    tbw = TreebankWordTokenizer()
    for doc in documents:
        real_tokens = []  # reset per document so each entry only carries its own tokens
        text = doc["text"]
        file = doc["id"]
        text = text.replace("\"", "'")
        #text = text.replace("/", " ")
        text = text.replace("-", " ")
        text = text.replace(".", " ")
        tokens = tbw.span_tokenize(text)
        for token in tokens:
            token_txt = text[token[0]:token[1]]
            found = False
            for tag in doc["tags"]:
                if int(tag["start"])<=token[0] and int(tag["end"])>=token[1]:
                    token_tag = tag["tag"]
                    token_tag_type = tag["type"]
                    found = True
            if not found:
                token_tag = "O"
                token_tag_type = "O"

            real_tokens.append({"token":token_txt,"start":token[0],"end":token[1],"tag":token_tag,"tag_type":token_tag_type})
        documents2.append({"id": file, "text": text, "tags": doc["tags"],"tokens":real_tokens})
    return documents2
Example #7
File: lda.py  Project: 0077cc/NewsBlur
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
  from nltk.tokenize.treebank import TreebankWordTokenizer
  tokenizer = TreebankWordTokenizer()

  from nltk.corpus import stopwords
  stop = stopwords.words('english')
  
  from string import ascii_lowercase
  
  docs = {}
  print("Found %i stories" % stories.count())
  for story in stories:
    text = zlib.decompress(story.story_content_z)
    # text = story.story_title
    text = ''.join(BeautifulSoup(text).findAll(text=True)).lower()
    if delimiter:
      sections = text.split(delimiter)
    else:
      sections = [text]
            
    if doc_limit > 0 and len(docs) > doc_limit:
      print("Passed doc limit %i" % len(docs))
      break
    print(story.story_title, len(sections))

    for jj in range(len(sections)):
      docs["%s-%i" % (story.story_title, jj)] = [x for x in tokenizer.tokenize(sections[jj]) \
                                  if (x not in stop) and \
                                  all(y in ascii_lowercase for y in x)]
  return docs
Example #8
def tokenize():
    text = request.json["text"]
    try:
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    except LookupError:
        nltk.download('punkt')
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    return {"tokens": [(s[0], s[1], text[s[0]:s[1]]) for s in spans]}
Example #9
File: word.py  Project: cltk/cltk
 def tokenize(self, text: str):
     """
     :rtype: list
     :param text: text to be tokenized into sentences
     :type text: str
     """
     sents = self.sent_tokenizer.tokenize(text)
     tokenizer = TreebankWordTokenizer()
     return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
Example #11
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    tokenizer = TreebankWordTokenizer()
    table = str.maketrans(dict.fromkeys(string.punctuation + '0123456789'))  # to remove numbers & punctuation
    with open(path, encoding='utf8') as f:
        for l in f:
            sentences.append(tokenizer.tokenize(l.translate(table).lower()))
    return sentences
def treebank_tokenizer(sentence):
    # split clitics such as 's, while protecting <...> markers with placeholders for later restoration
    t = TreebankWordTokenizer()
    word_lst = t.tokenize(sentence.lower().replace("<", "LAB_").replace(
        ">", "_RAB"))
    ret = []
    for w in word_lst:
        ret.append(w.replace("LAB_", "<").replace("_RAB", ">"))
    return ret
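
A quick sketch of the round trip above, assuming TreebankWordTokenizer has been imported as in the other examples; the placeholder trick lets the angle-bracket markers survive tokenization:

print(treebank_tokenizer("The <unk> token appears twice: <unk>."))
# e.g. ['the', '<unk>', 'token', 'appears', 'twice', ':', '<unk>', '.']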
def read(fn, test_percentage, maxlen, max_features, dataset_type):
    """
    :param fn: dataset filename.
    :param test_percentage: fraction of instances held out as the test set.
    :param maxlen: maximum length for each sentence.
    :param max_features: max_features (e.g., unique words, vocabulary)
    :param dataset_type: 'regression' for real-valued labels, otherwise binary classification.
    :return:
    """

    tokenizer = TreebankWordTokenizer()
    c = count(2)
    word_idx = {}
    try:
        lines = codecs.open(fn, encoding='utf8').read().splitlines()
    except UnicodeDecodeError:
        lines = codecs.open(fn).read().splitlines()
    y = []
    X = []
    for line in lines:
        try:
            label, sentence = line.split('\t')
        except ValueError:
            continue
        y.append(label)
        s = []
        for token in tokenizer.tokenize(sentence):
            idx = word_idx.get(token, None)
            if idx is None:
                idx = next(c)
                if idx < max_features:
                    word_idx[token] = idx
                else:
                    idx = 1
            s.append(idx)
        X.append(s)

    X = sequence.pad_sequences(X, maxlen=maxlen)
    num_instance_for_train = int(len(X) * (1 - test_percentage))

    # convert labels into floats if the labels are real-valued.
    if dataset_type == 'regression':
        y = [float(e) for e in y]
    else:
        set_y = set(y)
        print(set_y, file=sys.stderr)
        label1, label2 = set_y  # now supporting only binary classification.
        labels = {label1: 0, label2: 1}
        y = [labels[e] for e in y]  # map labels to 0/1.

    y = np.array(y)

    print("training set size {}, test set size {}".format(num_instance_for_train,
                                                           max(len(X) - num_instance_for_train, 0)))

    return (X[:num_instance_for_train, :], y[:num_instance_for_train]), (X[num_instance_for_train:, :],
                                                                         y[num_instance_for_train:]), word_idx
def english_tokenization(term):
  word_tokenizer = TreebankWordTokenizer()
  tokenized_term = ""

  for word in word_tokenizer.tokenize(term):
    if tokenized_term != "":
      tokenized_term += " "
    tokenized_term += word

  return tokenized_term
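
The loop in english_tokenization is equivalent to joining the tokens with single spaces; a quick check of that equivalence:

from nltk.tokenize.treebank import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
assert english_tokenization("It's fine.") == " ".join(tokenizer.tokenize("It's fine."))
# both yield "It 's fine ."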
Example #15
async def tokenize(request: Request):
    body = await request.json()
    text = body["text"]
    print(text)
    try:
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    except LookupError:
        nltk.download('punkt')
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    return {"tokens": [(s[0], s[1], text[s[0]:s[1]]) for s in spans]}
def transform_texts(art,
                    period,
                    site,
                    ngrams=1,
                    mod=None,
                    text_column='text',
                    text_token_column='text_token',
                    remain_columns=('author', 'site', 'link')):
    """Transform dataframe with texts, create tokenized lists in columns.
    Save dataframe to mod directory, if mod is not None."""
    text_column_paragraphs = text_column + '_paragraphs'
    text_token_column_lower = text_token_column + '_lower'
    text_token_column_stemmed = text_token_column + '_stemmed'
    text_token_column_count = text_token_column + '_count'

    st = SnowballStemmer('english')
    art.dropna(subset=[text_column],
               inplace=True)  # maketrans fails if there are nans
    art_sh = art[list((text_column, ) +
                      remain_columns)].copy()  # we don't need more columns
    del art
    gc.collect()

    additional_punctuation = string.punctuation + '«»…—’‘“”–•'  # a few additional, non-ascii chars
    # gigaom
    tt = TreebankWordTokenizer()
    art_sh[text_column] = art_sh[text_column].apply(
        lambda x: x.replace('Tweet\nShare\nPost\n', '').replace(
            '“', '').replace('”', '').replace('’', '\''))
    # split into paragraphs first; sent_tokenize then splits each paragraph into sentences
    art_sh[text_column_paragraphs] = art_sh[text_column].apply(
        lambda x: x.split('\n\n'))
    art_sh[text_token_column] = art_sh[text_column_paragraphs].apply(
        lambda x:
        [flatten([tt.tokenize(z) for z in sent_tokenize(y)]) for y in x])
    # to lower, stem
    art_sh[text_token_column_lower] = art_sh[text_token_column].apply(
        lambda x: [[word.lower() for word in paragraph] for paragraph in x])
    art_sh[text_token_column_stemmed] = art_sh[text_token_column_lower].apply(
        lambda x: [[st.stem(word) for word in paragraph] for paragraph in x])
    if ngrams == 2:  # convert to bigrams
        art_sh[text_token_column] = art_sh[text_token_column_lower].apply(
            to_bigram)
        art_sh[text_token_column_lower] = art_sh[
            text_token_column_lower].apply(to_bigram)
        art_sh[text_token_column_stemmed] = art_sh[
            text_token_column_stemmed].apply(to_bigram)

    art_sh[text_token_column_count] = art_sh[text_token_column_stemmed].apply(
        lambda x: dict(Counter(FreqDist(flatten(x)))))

    if mod is not None:
        art_sh.to_csv(mod + 'dfs_articles' + period + site + '.csv')

    return art_sh
Example #17
    def __init__(self, tokenizer_method: str = "TreebankWordTokenizer"):
        self.token2idx = {}
        self.tokenizer = None

        if tokenizer_method == "TreebankWordTokenizer":
            self.tokenizer = TreebankWordTokenizer()
        else:
            raise NotImplementedError(
                "tokenizer_method {} doesn't exist".format(tokenizer_method))

        self.add_token(UNK_TOKEN)  # Add UNK token
Example #18
class DocumentTokenizer(object):
    """
    Used to split a document into sentences and tokens.
    Returns a list of lists TODO
    """
    def __init__(self, sent_tokenizer=None, word_tokenizer=None):
        # fall back to the defaults, but keep any tokenizers that were passed in
        if not sent_tokenizer:
            #self.sent_tokenizer = ClinicalRushSentenceTokenizer('rush_rules.tsv')
            sent_tokenizer = DefaultSentenceTokenizer()
        if not word_tokenizer:
            word_tokenizer = TreebankWordTokenizer()
        self.sent_tokenizer = sent_tokenizer
        self.word_tokenizer = word_tokenizer

    def tokenize_doc(self, doc):
        """
        Takes raw string. Returns a list of lists where each list is the
        sentence, and each sentence contains two-tuples of tokens and spans.
        """
        tokenized_sents_and_spans = []
        try:
            # sentence_spans is a list of (start, end) tuples
            sentence_spans = self.sent_tokenizer.tokenize_sents(doc)
        except Exception as e:
            raise e
        for start, end in sentence_spans:
            sentence = doc[start:end]
            tokenized_sents_and_spans.append(
                self.tokenize_sent(sentence, start))
        return tokenized_sents_and_spans

    def tokenize_sent(self, sentence, offset):
        try:
            tokens = self.word_tokenizer.tokenize(sentence)
        except Exception as e:
            print("Word tokenizing failed")
            print(sentence)
            raise e
        try:
            spans = self.word_tokenizer.span_tokenize(sentence)
        except Exception as e:
            print("Span tokenizing failed")
            print(sentence)
            raise e
        tokens_and_spans = []
        for token, span in zip(tokens, spans):
            start, end = span
            true_start = start + offset
            true_end = end + offset
            tokens_and_spans.append((token, (true_start, true_end)))
        return tokens_and_spans
Example #19
def tokenize(sents):
    """Identifica los tokens del las oraciones de entrada
    
    Returns:
        Una lista de oraciones. Cada oración es una lista de tokens
    """
    tokenizer = TreebankWordTokenizer()

    sent_tokens = [tokenizer.tokenize(sent) for sent in sents]

    return sent_tokens
Example #20
def tokenize(review: str) -> list:
    """Tokenize string based on NLTK TreebankWordTokenizer.

    Args:
        review: The raw review content.

    Returns:
        A list of tokens found by the NLTK tokenizer.
    """
    tokenizer = TreebankWordTokenizer()
    return tokenizer.tokenize(review)
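
A quick check of the wrapper above:

print(tokenize("This movie wasn't bad at all!"))
# e.g. ['This', 'movie', 'was', "n't", 'bad', 'at', 'all', '!']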
Example #21
File: word.py  Project: cltk/cltk
 def tokenize(self, text: str):
     """
     :rtype: list
     :param text: text to be tokenized into sentences
     :type text: str
     :param model: tokenizer object to used # Should be in init?
     :type model: object
     """
     sents = self.sent_tokenizer.tokenize(text)
     tokenizer = TreebankWordTokenizer()
     return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
Example #22
File: text.py  Project: lucas0/Lux
 def __init__(self, sentence_tokenizer: Any = None, paragraph_threshold: int = 150):
     """
     Constructor
     :param sentence_tokenizer: a sentences_tokenizer that provide a tokenize(t:str)->[str] method
     (for instance: nltk.data.load('tokenizers/punkt/english.pickle'))
     :param paragraph_threshold: the minimum number of characters of paragraph should contains (it will be
     filtered otherwise)
     """
     self.sentences_tokenizer = sentence_tokenizer  # might wanna use
     self.paragraph_threshold = paragraph_threshold
     self._word_tokenizer = TreebankWordTokenizer()
Example #24
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()
        self.tokenizer.PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\<\>]|[\{\}]+'), r' \g<0> ')

        # See discussion on https://github.com/nltk/nltk/pull/1437
        # Adding to TreebankWordTokenizer, the splits on
        # - chervon quotes u'\xab' and u'\xbb' .
        # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
        
        improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
        improved_close_quote_regex = re.compile(u'([»”’])', re.U)
        improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
        self.tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
        self.tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
        self.tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
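
A sketch of the effect of the extra quote rules, applying the same regexes to a bare tokenizer outside the class (note that inserting into these lists mutates class-level state, just as the constructor above does):

import re
from nltk.tokenize.treebank import TreebankWordTokenizer

tok = TreebankWordTokenizer()
tok.STARTING_QUOTES.insert(0, (re.compile(u'([«“‘])', re.U), r' \1 '))
tok.ENDING_QUOTES.insert(0, (re.compile(u'([»”’])', re.U), r' \1 '))
print(tok.tokenize('Il a dit «bonjour» hier.'))
# e.g. ['Il', 'a', 'dit', '«', 'bonjour', '»', 'hier', '.']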
Example #25
    def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError(
                'TreebankEncoder defines a tokenize callable TreebankWordTokenizer'
            )

        try:
            import nltk

            # Required for moses
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')

            from nltk.tokenize.treebank import TreebankWordTokenizer
            from nltk.tokenize.treebank import TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise

        self.detokenizer = TreebankWordDetokenizer()

        super().__init__(*args,
                         **kwargs,
                         tokenize=TreebankWordTokenizer().tokenize)
Example #26
def word_tokenize(text, language="spanish"):
    """
		It splits the text into words
		
		Args:
			text:		text to be splited
			language:	language of the tokenizer to be used
			
		Returns:
			List of words
	"""

    #try to use from local
    try:
        from nltk.tokenize.treebank import TreebankWordTokenizer

        _treebank_word_tokenize = TreebankWordTokenizer().tokenize

        return [
            token for sent in sent_tokenize(text)
            for token in _treebank_word_tokenize(sent)
        ]

    #if not, use nltk
    except IOError:
        from nltk import word_tokenize

        return word_tokenize(text, language)
class TreebankSpanTokenizer(TreebankWordTokenizer):
    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            yield (ix, end)
            ix = end

    def tokenize(self, text, withSpans=False):
        tokens = self._word_tokenizer.tokenize(text)

        if not withSpans:
            return tokens

        spans = []
        ix = 0
        for word_token in tokens:
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            spans.append((ix, end))
            ix = end

        return zip(tokens, spans)
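
A usage sketch of TreebankSpanTokenizer; note the find-based alignment assumes the tokenizer does not rewrite tokens (straight double quotes, for instance, would break it):

st = TreebankSpanTokenizer()
print(list(st.tokenize("Hello, world!", withSpans=True)))
# e.g. [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]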
Example #28
def term_frequency(sentence, ngrams=4):
    """Given a sentence, calculates term frequency of tuples.

    Parameters
    ----------
    sentence : str
        Sentence whose term frequency has to be calculated.
    ngrams : int
        Number of n-grams for which term frequency is calculated.

    Returns
    -------
    dict
        {tuple : int} key-value pairs representing term frequency.
    """
    sentence = sentence.lower().strip()
    for punc in PUNCTUATIONS:
        sentence = sentence.replace(punc, "")
    words = TreebankWordTokenizer().tokenize(sentence)
    counts = {}
    for i in range(ngrams):
        for j in range(len(words) - i):
            ngram = tuple(words[j:(j + i + 1)])
            if ngram in counts:
                counts[ngram] += 1
            else:
                counts[ngram] = 1
    return counts
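
A usage sketch for term_frequency; PUNCTUATIONS is a module-level constant in the original project, so a stand-in based on string.punctuation is assumed here:

import string
PUNCTUATIONS = list(string.punctuation)  # stand-in for the module's own constant

tf = term_frequency("the cat sat on the mat", ngrams=2)
# e.g. tf[('the',)] == 2 and tf[('the', 'cat')] == 1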
Example #29
 def __init__(self, data, tokenizer):
     self._text = to_unicode(data).strip()
     self._tokenizer = tokenizer
     self._treebank_word_tokenize = TreebankWordTokenizer().tokenize
     self.formdocument()
     self.extractsentences()
     self.extractwords()
Example #30
    def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print "Gathering sentences and removing stopwords"
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [
                word for word in TreebankWordTokenizer().tokenize(line.lower())
                if word not in stopwords
            ]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        # print(documents)
        print(len(documents), "documents read")
        print(len(self.dictionary), " unique tokens", self.dictionary)
Example #31
class Dictionary:
    def __init__(self, tokenizer_method: str = "TreebankWordTokenizer"):
        self.token2idx = {}
        self.tokenizer = None

        if tokenizer_method == "TreebankWordTokenizer":
            self.tokenizer = TreebankWordTokenizer()
        else:
            raise NotImplementedError(
                "tokenizer_method {} doesn't exist".format(tokenizer_method))

        self.add_token(UNK_TOKEN)  # Add UNK token

    def build_dictionary_from_captions(self, captions: List[str]):
        for caption in captions:
            tokens = self.tokenizer.tokenize(caption)
            for token in tokens:
                self.add_token(token)

    def size(self) -> int:
        return len(self.token2idx)

    def add_token(self, token: str):
        if token not in self.token2idx:
            self.token2idx[token] = len(self.token2idx)

    def lookup_token(self, token: str) -> int:
        if token in self.token2idx:
            return self.token2idx[token]
        return self.token2idx[UNK_TOKEN]
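
A usage sketch for the Dictionary class; UNK_TOKEN is a module-level constant in the original code, so it is stubbed here:

UNK_TOKEN = "<unk>"  # stand-in for the module's own constant

d = Dictionary()
d.build_dictionary_from_captions(["a dog runs", "a cat sleeps"])
print(d.size())                 # vocabulary size, including the UNK token
print(d.lookup_token("dog"))    # index of a known token
print(d.lookup_token("zebra"))  # unknown tokens fall back to the UNK index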
Example #32
 def __init__(self, modelID, inputXMLfilepath= "", modelType="", title="", objects=[]):
     '''
     Constructor
     @param modelID: identifier of the model
     @param inputXMLfilepath: path to the input XML file containing the model 
     if this parameter is left empty a new XML tree is created
     @param type: KAOS, TROPOS, or any other kind of model
     '''
     self.textFilter = TextFilter()
     self.wordTokenizer = TreebankWordTokenizer()        
     self.maxID = "100"  #@todo: we have to set the current maximum to the actual maximum value
                         #for the model
     self.modelInfo = ModelInfo(modelID)
     
     if not inputXMLfilepath == "":
         
         self.modelInfo.setLocation(inputXMLfilepath)
         
         self.tree =  ET.parse(self.modelInfo.getLocation())
     
         self.__loadModelInfo(self.modelInfo)
         self.modelGoals = self.__loadModelGoals()
         self.modelWords = self.__loadModelWords()
         self.modelStems = self.__loadModelStems()
     else:
         attributes = dict()
         attributes['type'] = modelType
         attributes['title'] = title
         attributes['object'] = objects
         root = Element("MODEL", attributes)
         self.tree = ElementTree(root)
Example #33
    def __init__(self):
        filename = 'Models/CRF_crfsuite_dict.crfsuite'
        self.crf_model = pycrfsuite.Tagger()
        self._treebank_word_tokenizer = TreebankWordTokenizer()
        country_file = open("Dictionaries/Countries.txt",'r', encoding='utf-8')
        self.dictionary_country = country_file.readlines()
        self.dictionary_country = set([line[:-1] for line in self.dictionary_country])
        city_file = open("Dictionaries/Cities.txt",'r', encoding='utf-8')
        self.dictionary_city = city_file.readlines()
        self.dictionary_city = set([line[:-1] for line in self.dictionary_city])

        first_name_file = open("Dictionaries/dictionary_first_names.txt", 'r', encoding='utf-8')
        self.dictionary_first_name = first_name_file.readlines()
        self.dictionary_first_name = set([line[:-1].lower() for line in self.dictionary_first_name])

        surname_file = open("Dictionaries/dictionary_surnames.txt", 'r', encoding='utf-8')
        self.dictionary_surname = surname_file.readlines()
        self.dictionary_surname = set([line[:-1].lower() for line in self.dictionary_surname])

        if os.path.exists(filename):
            self.crf_model.open('Models/CRF_crfsuite_dict.crfsuite')
        else:
            self.crf_model = None
        self.dictionary_job_titles = []
        with open('Dictionaries/job_title_dictionary.csv', encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file,delimiter=',')
            for row in csv_reader:
                if row[2]=='assignedrole':
                    candidates = row[0].lower().split(' ')
                    for can in candidates:
                        if len(can)>2:
                            self.dictionary_job_titles.append(can)
        self.dictionary_job_titles = set(self.dictionary_job_titles)
        pass
Example #34
def word_tokenize_by_string(note):
    translator = str.maketrans('', '', string.punctuation)
    _treebank_word_tokenizer = TreebankWordTokenizer()
    note = note.translate(translator)
    for digit in '0123456789':
        note = note.replace(digit, '#')
    tokenized_note = _treebank_word_tokenizer.tokenize(note)
    return tokenized_note
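
A quick sketch of the behaviour above: punctuation is stripped and digits are masked with '#' before tokenizing:

print(word_tokenize_by_string("Dose: 20 mg, twice daily."))
# e.g. ['Dose', '##', 'mg', 'twice', 'daily']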
Example #35
def generate_syntactically_similar_sentences_replace(num_of_perturb, dataset):
	"""Generate syntactically similar sentences for each sentence in the dataset.
	For PaInv-Replace
	Returns dictionary of original sentence to list of generated sentences
	"""
	# Use nltk treebank tokenizer and detokenizer
	tokenizer = TreebankWordTokenizer()
	detokenizer = TreebankWordDetokenizer()

	# Stopwords from nltk
	stopWords = list(set(stopwords.words('english')))

	# File from which sentences are read
	file = open(dataset, "r")

	# when we use Bert
	berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
	bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
	bertmodel.eval()

	# Number of perturbations you want to make for a word in a sentence
	dic = {}
	num_of_perturb = 50
	num_sent = 0
	for line in file:
		s_list = line.split("\n")
		source_sent = s_list[0]
		# Generating new sentences using BERT
		new_sents = perturb(source_sent, bertmodel, num_of_perturb)
		dic[line] = new_sents		
		if new_sents != []:
			num_sent += 1
	return dic
Example #36
 def __init__(self, modelIndexManager):
     '''
     @param modelIndex: reference to the place where the models are indexed
     '''
     self.textFilter = TextFilter()
     self.modelIndexManager = modelIndexManager
     self.wordTokenizer = TreebankWordTokenizer()
     self.tRecommender = TransformationRecommender() 
Example #37
File: word.py  Project: cltk/cltk
    def tokenize(self, text: str, split_enclitics:list = ['ne', 'n', 'que', 've', 'ue', 'st'],
                                  split_words:list = []):
        """
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        :param model: tokenizer object to used # Should be in init?
        :type model: object
        """
        if self._latin_replacements:
            split_words = self._latin_replacements

        if split_words:
            text = self._replace_patterns(text, split_words)
        sents = self.sent_tokenizer.tokenize(text)
        if split_enclitics:
            sents = self._split_enclitics(sents, split_enclitics)
        tokenizer = TreebankWordTokenizer()
        return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
class TransformationRecommender(object):
    '''
    This class recommends a transformation according
    to the model information ModelInfo object and the query issued
    '''
    
    '''
    This object is a Singleton, since it does not have private data
    but only functions: the code below defines a singleton
    '''
    _instance = None
    
    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(TransformationRecommender, cls).__new__(
                                cls, *args, **kwargs)
        return cls._instance
    
    def __init__(self):
        self.tf = TextFilter()
        self.wordTokenizer = TreebankWordTokenizer()

    def getRecommendedTransformation(self, modelInfo, query):
        '''
        If the input sentence is the same as the title except than in the title
        part that specifies the object, then "object change" shall be suggested
        '''
        title = modelInfo.getName()
        titleFiltered = self.tf.filter_all_except_stem(title)
        titleToks = self.wordTokenizer.tokenize(titleFiltered)
        
        titleToksNoObj = [t for t in titleToks if t not in modelInfo.getObjects()]
        
        queryFiltered = self.tf.filter_all_except_stem(query)
        sentenceToks = self.wordTokenizer.tokenize(queryFiltered)
        
        if set(titleToksNoObj).issubset(sentenceToks):
            return OBJECT_CHANGE
        else:
            return ''
def testset_read(fn, word_idx, maxlen):
    total_num_of_unk = 0
    tokenizer = TreebankWordTokenizer()
    try:
        lines = codecs.open(fn, encoding='utf8').read().splitlines()
    except UnicodeDecodeError:
        lines = codecs.open(fn).read().splitlines()
    X = []
    sentences = []
    for line in lines:
        s = []
        for token in tokenizer.tokenize(line):
            idx = word_idx.get(token, 1)  # 1 is UNKNOWN word id
            if idx == 1:
                total_num_of_unk += 1
            s.append(idx)
        X.append(s)
        sentences.append(line)

    X = sequence.pad_sequences(X, maxlen=maxlen)

    print("Total number of UNK={}, Avg. {}".format(total_num_of_unk, total_num_of_unk / float(len(sentences))), file=sys.stderr)
    return X, sentences
class TreebankSpanTokenizer(TreebankWordTokenizer):

    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix+len(word_token)
            yield (ix, end)
            ix = end

    def tokenize(self, text):
        return self._word_tokenizer.tokenize(text)
 def __init__(self):
     self.tf = TextFilter()
     self.wordTokenizer = TreebankWordTokenizer()
 def __init__(self):
     self._word_tokenizer = TreebankWordTokenizer()
Example #43
class RequirementsModel(object):
    '''
    This class embeds the information residing in the XML
    of a requirements model passed as input parameter
    during construction 
    '''

    def __init__(self, modelID, inputXMLfilepath= "", modelType="", title="", objects=[]):
        '''
        Constructor
        @param modelID: identifier of the model
        @param inputXMLfilepath: path to the input XML file containing the model 
        if this parameter is left empty a new XML tree is created
        @param type: KAOS, TROPOS, or any other kind of model
        '''
        self.textFilter = TextFilter()
        self.wordTokenizer = TreebankWordTokenizer()        
        self.maxID = "100"  #@todo: we have to set the current maximum to the actual maximum value
                            #for the model
        self.modelInfo = ModelInfo(modelID)
        
        if not inputXMLfilepath == "":
            
            self.modelInfo.setLocation(inputXMLfilepath)
            
            self.tree =  ET.parse(self.modelInfo.getLocation())
        
            self.__loadModelInfo(self.modelInfo)
            self.modelGoals = self.__loadModelGoals()
            self.modelWords = self.__loadModelWords()
            self.modelStems = self.__loadModelStems()
        else:
            attributes = dict()
            attributes['type'] = modelType
            attributes['title'] = title
            attributes['object'] = objects
            root = Element("MODEL", attributes)
            self.tree = ElementTree(root)
    
    def __loadModelInfo(self, modelInfo):
        '''
        This function load the name of the model from the "title" field of the MODEL tag,
        together with the type and the objects, and stores these information in the 
        ModelInfo object
        '''
        root = self.tree.getroot()
        
        modelInfo.setName(self.textFilter.lower_all(root.get("title")))
        modelInfo.setType(self.textFilter.lower_all(root.get("type")))
        
        objects = root.get("object").strip().split(OBJ_SEPARATOR)
        lowercaseObjects = [self.textFilter.lower_all(o) for o in objects]
        modelInfo.setObjects(lowercaseObjects)   
    
    
    def __loadModelGoals(self):
        '''
        The function loads the goal names included in the model
        and returns a list with all the goals of the model.
        The goals names are stored as lowercase goals
        '''  
        root = self.tree.getroot()
        goalNames = list()

        for child in root.iter('ENTITY'):
            if child.attrib['type'] == 'goal': 
                goalNames.append(self.textFilter.lower_all(child.attrib['name'])) 
                    
        return goalNames
        
        
    def __loadModelWords(self):
        '''
        The function loads the words included in the model
        and returns a dictionary with all the words of the model
        and their frequency
        '''
               
        tokenizedWords = dict()

        if self.modelGoals is not None:
            for name in self.modelGoals:
                nameFiltered = self.textFilter.filter_all_except_stem(name)
                words = self.wordTokenizer.tokenize(nameFiltered)
                for word in words:
                    if word not in tokenizedWords:
                        tokenizedWords[word] = 1
                    else:
                        tokenizedWords[word] = tokenizedWords[word] + 1
                    
        return tokenizedWords
        
    def __loadModelStems(self):
        '''
        The function loads the stems included in the model
        and returns a dictionary with all the stems of the model
        and their frequency
        ''' 
        tokenizedStems = dict()
        
        if self.modelWords is not None:
            for w in self.modelWords.keys():
                stem = self.textFilter.filter_all(w)
                if stem not in tokenizedStems:
                    tokenizedStems[stem] = self.modelWords[w]
                else:
                    tokenizedStems[stem] = tokenizedStems[stem] + self.modelWords[w]
                    
        return tokenizedStems
        
    def __getModelStems(self):
        return self.modelStems.keys()
    
    def __getModelWords(self):
        return self.modelWords.keys()
    
    def __getModelGoals(self):
        return self.modelGoals
    
    def __getModelStemsAndFreq(self):
        return self.modelStems
    
    def __getModelWordsAndFreq(self):
        return self.modelWords
    
    def getModelInfo(self):
        return self.modelInfo
    
    def getModelID(self):
        return self.modelInfo.getId()
    
    def getModelKeys(self, keyType):
        if keyType == STEM_STRING:
            return self.__getModelStems()
        if keyType == WORD_STRING:
            return self.__getModelWords()
        if keyType == GOAL_STRING:
            return self.__getModelGoals() 
        
    def getModelKeysAndFrequencies(self, keyType):
        if keyType == STEM_STRING:
            return self.__getModelStemsAndFreq()
        if keyType == WORD_STRING:
            return self.__getModelWordsAndFreq()
        if keyType == GOAL_STRING:
            return dict(zip(self.__getModelGoals(), [1] * len(self.__getModelGoals())))
            
        
    def changeTitle(self, newTitle):
        '''
        This function shall change the title of the model, 
        which means changing the modelInfo and the XML
        of the model
        '''
        #self.modelInfo.setName(newTitle)
        
        root = self.tree.getroot()
        root.set("title", newTitle)
        self.__loadModelInfo(self.modelInfo) #the function updates the modelInfo structure
    
    def changeObjects(self, newObjectsList):
        '''
        This function shall change the objects of the model,
        which means changing the modelInfo 
        but also the XML of the model
        '''
        
        root = self.tree.getroot()
        
        newObjects = ' ,'.join([o for o in newObjectsList])
        root.set("object", newObjects)  
        self.__loadModelInfo(self.modelInfo)
        
    def changeGoalName(self, goalID, newGoalName):
        '''
        @param goalID: ID of the goal that shall have a new name
        @param newGoalName: string representing the new name of the goal  
        '''
        root = self.tree.getroot()

        for child in root.iter('ENTITY'):
            if child.attrib['type'] == 'goal' and child.attrib['id'] == goalID:
                child.attrib['name'] = newGoalName 
        
    def searchGoalByName(self, goalName):
        '''
        @param goalName: name of the goal to be searched
        return: goalID, which is the unique ID of the goal, if the goal exist
                -1, if the goal is not found
        '''
        root = self.tree.getroot()

        for child in root.iter('ENTITY'):
            if child.attrib['type'] == 'goal' and child.attrib['name'] == goalName:
                return child.attrib['id']
        
        return -1 
    
    def searchGoalsBySubstring(self, goalSubstring, caseSensitive = "NO"):
        '''
        @param goalSubstring: a substring that shall be searched among the goal names. 
        By default the search is not case sensitive
        return: a list with the couples [ID, goalName] of the goals that include the @param goalSubstring
        '''
        root = self.tree.getroot()
        goalDict = dict()

        for child in root.iter('ENTITY'):
            if child.attrib['type'] == 'goal': 
                if caseSensitive == "NO":
                    if self.textFilter.lower_all(goalSubstring) in self.textFilter.lower_all(child.attrib['name']):
                        goalDict[child.attrib['id']] = child.attrib['name']
                else:
                    if goalSubstring in child.attrib['name']:
                        goalDict[child.attrib['id']] = child.attrib['name']
                
        
        return goalDict
    
    def __assignUniqueIDs(self, treeRoot):
        '''
        This function assigns unique IDs to all the objects 
        of type ENTITY in @param tree
        '''
        currentMaxId = self.maxID
        for child in treeRoot.iter('ENTITY'):
            currentMaxId = str( int(currentMaxId) + 1 )
            child.attrib['id'] = currentMaxId
            
        self.maxID = currentMaxId
    
    def insertTree(self, parentID, childTree):
        '''
        Given a @param childTree, which is a tree or a node, this is added as a child of parentID
        below the first refinement of the parent. 
        The assumption here is that each parent can have ONLY ONE TYPE of refinement.
        The unique IDs to the child elements are dynamically assigned by the function. 
        The childTree could be also a single node.
        '''
        root = self.tree.getroot()
        
        for child in root.iter('ENTITY'):
            if child.attrib['id'] == parentID:
                refinement = child.findall("REFINEMENT")
                if refinement and len(refinement) == 1: #ONLY ONE TYPE of refinement is allowed for each element
                    self.__assignUniqueIDs(childTree)
                    refinement[0].append(childTree)
                    return

    def saveModelAs(self, destinationFilePath):
        '''
        @param destinationFilePath: path of the file where the model shall be saved.
        @todo: currently the model is saved to another location and the original location
        is lost. Therefore, the model currently keeps the same ID. We have to change
        this behaviour. 
        '''
        self.modelInfo.setLocation(destinationFilePath) 
        self.saveModel()
        
    def saveModel(self):
        '''
        Save the model in the same destination as the input folder
        and with the original name
        '''
        try:
            self.tree.write(self.modelInfo.getLocation())
        except IOError:
            print "IOError: Saving to a path that does not exist! Use saveModelAs() instead"
        except:
            print "An error occurred"
Example #44
 def __init__(self):
     '''
     Constructor
     '''
     self.stopwords = stopwords.words('english')
     self.wordTokenizer = TreebankWordTokenizer()
from nltk import ne_chunk,pos_tag
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
'''
	import nltk
	nltk.download('words')
	nltk.download('punkt')
	nltk.download('maxent_treebank_pos_tagger')
	nltk.download('maxent_ne_chunker')
'''


TreeBankTokenizer = TreebankWordTokenizer()
PunktTokenizer = PunktSentenceTokenizer()
text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
 of the Eastern Conference. Founded in 1946, the team is currently owned by 
 Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
 which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.
 
 The Celtics have dominated the league during the late 50's and through the mid 80's, 
 with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek, 
 Larry Bird and legendary Celtics coach Red Auerbach, 
 combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''

sentences = PunktTokenizer.tokenize(text)
tokens = [TreeBankTokenizer.tokenize(sentence) for sentence in sentences]
tagged = [pos_tag(token) for token in tokens]
chunked = [ne_chunk(taggedToken) for taggedToken in tagged]
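
A short follow-up sketch pulling the named-entity subtrees out of the chunked sentences produced above (the exact labels depend on the NLTK NE chunker):

entities = [(subtree.label(), " ".join(word for word, tag in subtree.leaves()))
            for tree in chunked
            for subtree in tree
            if hasattr(subtree, "label")]
print(entities)
# e.g. [('ORGANIZATION', 'Boston Celtics'), ('ORGANIZATION', 'National Basketball Association'), ...]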
Example #46
#!/usr/bin/env python

from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk, Tree

import re

_word_tokenizer = TreebankWordTokenizer()
_stemmer = PorterStemmer()
_lemmatizer = WordNetLemmatizer()


text = "At 12:35 a.m. ET (1735 GMT) the Dow Jones industrial average .DJI was up 211.89 points, or 1.31 percent, at 16,363.3."

# tokenize sentence
cleaned_sentence = re.sub(r'\W', ' ', text)
tokens = _word_tokenizer.tokenize(cleaned_sentence)

tokens_stemmed = [_stemmer.stem(word.lower()) for word in tokens]

print(tokens_stemmed)
 def __init__(self):
     '''
     Constructor
     '''
     self.stopwords = open(stopwords_file, 'r').read().split()
     self.wordTokenizer = TreebankWordTokenizer()
Example #48
def consistency(s):
    """
    >>> m = consistency("Batch gradient descent algorithms "
    ...                 "... in Batch Gradient Descent ...")
    >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2]
    [('gradientdescent', ['Gradient Descent', 'gradient descent'])]

    >>> m = consistency("This sentence's first word appears uncapitalized in "
    ...                 "this sentence.  Hadoop should be capitalized as "
    ...                 " Hadoop, not hadoop.")
    >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2]
    [('hadoop', ['Hadoop', 'hadoop'])]

    If the second word of a sentence is capitalized, it will be considered
    if and only if the following word is uncapitalized:

    >>> m = consistency("The Operator may be replaced by another operator")
    >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2]
    [('operator', ['Operator', 'operator'])]
    >>> m = consistency("The Operator Descriptor describes an operator")
    >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2]
    []

    """
    sent_tokenizer = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    mappings = defaultdict(set)

    sentences = sent_tokenizer.tokenize(s)
    for sent in sentences:
        tokens = tokenizer.tokenize(sent)
        # The capitalization of individual words poses a problem: we would like
        # to detect cases where names are miscapitalized (e.g. hadoop instead
        # of Hadoop), but we want to avoid false-positives due to capitalized
        # words that start a sentence or are part of capitalized phrases.

        # Therefore, we only add mappings for capitalized unigrams if they do
        # not start a sentence and are not adjacent to other capitalized words.
        for i in range(1, len(tokens)):
            prev_token = tokens[i-1]
            token = tokens[i]
            if i+1 < len(tokens):
                next_token = tokens[i+1]
            else:
                next_token = ""
            adjacent_uppercase = (i > 1 and is_uppercase(prev_token)) or \
                                 is_uppercase(next_token)
            if is_uppercase(token) and adjacent_uppercase:
                continue
            norm = canonicalize(token)
            source = token.strip(",. ")
            mappings[norm].add(source)
        # Map normalized ngrams
        for x in range(2, MAX_PHRASE_LENGTH+1):
            for ngram in ngrams(tokens, x):
                norm = canonicalize(ngram)
                source = " ".join(ngram).strip(",. ")
                if len(source.split()) == x:
                    mappings[norm].add(source)

    # For normalized forms with multiple values, filter out longer ngrams that
    # may be covered by shorter ones or that are trivial capitalization
    # differences
    for (key, values) in list(mappings.items()):  # copy items; entries may be deleted below
        if len(values) > 1:
            for (a, b) in bigrams(values):
                (x, y) = [" ".join(x) for x in strip_common_fixes(a.split(),
                                                                  b.split())]
                if (x, y) != (a, b):
                    del mappings[key]
                    break
        else:
            del mappings[key]
    return mappings
class TextFilter(object):
    '''
    This class filters a text, providing typical IR functions such as stop word removal,
    stemming and so forth
    '''


    def __init__(self):
        '''
        Constructor
        '''
        self.stopwords = open(stopwords_file, 'r').read().split()
        self.wordTokenizer = TreebankWordTokenizer()
        
    def remove_stopwords(self, string_text):
        """
        The function takes a string as input and returns a string
        without the stopwords
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        filteredtext = ' '.join([t for t in tokens if t.lower() not in self.stopwords])
        return filteredtext

    def __remove_item_from_term(self, term, item):
        """
        remove character @param item from the @param term
        """
        return ''.join([c for c in term if c != item])
        

    def remove_item(self, string_text, item):
        """
        remove character @param item from the string
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        filteredtext = ' '.join([self.__remove_item_from_term(t, item) for t in tokens])
        return filteredtext
    
    def stem_words(self, string_text):
        """
        The function takes a string as input and returns a string with stemmed words
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        stemmer = PorterStemmer()
        stemmedtext = ' '.join([stemmer.stem(t) for t in tokens]) 
        return stemmedtext
    
    def remove_punct(self, string_text):
        """
        The function takes a string as input and returns the same string without punctuation
        """ 
        nopunct_text = ''.join([c for c in string_text if re.match("[a-zA-Z\-\' \n\t]", c)]) 
        return nopunct_text

    def lower_all(self, string_text):
        """
        Reduce each term in @param string_text to lowercase
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        lowercase_string = ' '.join([t.lower() for t in tokens])
        return lowercase_string

    def remove_single_char(self, string_text):
        """
        remove single char items from @param string_text
        """
        tokens = self.wordTokenizer.tokenize(string_text)
        no_single_char_string = ' '.join([t for t in tokens if len(t) > 1])
        return no_single_char_string
    
    def filter_all(self, string_text):
        """
        executes all the filter functions on @param string_text 
        @param string_text: input text
        """
        sentence_no_punct = self.remove_punct(string_text)
        sentence_no_single_char = self.remove_single_char(sentence_no_punct)
        sentence_no_stopwords = self.remove_stopwords(sentence_no_single_char)
        filtered_sentence = self.stem_words(sentence_no_stopwords)

        filtered_sentence = self.lower_all(filtered_sentence)
        #filtered_sentence = self.lower_all(sentence_no_single_char)
        
        return filtered_sentence
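
A usage sketch for TextFilter; stopwords_file is a module-level path in the original project, so the exact output depends on that stopword list:

tf = TextFilter()
print(tf.filter_all("The Cats are chasing the mice!"))
# e.g. "cat chase mice" after punctuation/stopword removal, stemming and lowercasing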
Example #50
class QueryManager(object):
    '''
    Given a specification query, this object returns a set of models
    together with possible transformations that can be applied to the model
    to satisfy the specification query
    '''


    def __init__(self, modelIndexManager):
        '''
        @param modelIndex: reference to the place where the models are indexed
        '''
        self.textFilter = TextFilter()
        self.modelIndexManager = modelIndexManager
        self.wordTokenizer = TreebankWordTokenizer()
        self.tRecommender = TransformationRecommender() 
        
    def __parseQuery(self, queryString):
        '''
        This function returns the words included in queryString,
        after filtering all the stopwords, performing stemming
        and applying all the filters provided by textFilter
        @param queryString: the specification query in the form of a string 
        ''' 
        filteredQueryString = self.textFilter.filter_all(queryString)
        return self.wordTokenizer.tokenize(filteredQueryString)
        
    def issueQuery(self, queryString):
        '''
        This is the main function of this class. Given the specification
        query, the function parses the specification and returns a
        set of QueryResult objects, which include the link to the models
        @param queryString: the specification query in the form of a string
        @return: a list of QueryResult objects.
        '''
        qr = list()
        
        stems = self.__parseQuery(queryString)
        for stem in stems:
            
            modelsInfos = self.modelIndexManager.searchModels(stem, STEM_STRING)
            
            #modelsTransformationsList = [(model, "object change") for model in models]
            #results[stem] = modelsTransformationsList
        
            if not modelsInfos == None:
                for modelInfo in modelsInfos:
                    score = 0.1
                    transformation = self.tRecommender.getRecommendedTransformation(modelInfo, queryString)
                    qr.append(QueryResult(modelInfo, [transformation], score))
                    
            qr.sort(key=lambda x: x.score) #the list is ordered by the score attribute and reversed
            qr.reverse()
        
        '''
        @todo: for each model we shall understand which is the best transformation.
        To this end, an additional class is required.
        Currently, we always add the object change transformation together 
        with each model found. 
        '''
        
        return qr   
Example #51
from collections import defaultdict
from glob import glob
import sys

from nltk.tokenize.treebank import TreebankWordTokenizer

VERBS = set(["geben", "helfen", "sagen", "machen", "arbeiten", "bringen"])

tokenizer = TreebankWordTokenizer()

if __name__ == "__main__":
    pattern = sys.argv[1]

    sentences = defaultdict(list)

    for ii in glob(pattern):
        for jj in (line.strip() for line in open(ii)):
            if any(jj.endswith("%s." % verb) for verb in VERBS):
                sentence = jj.split(".")[-2]
                words = tokenizer.tokenize(sentence)
                sentences[words[-1]].append(words)

    good_verbs = [x for x in sentences if len(sentences[x]) > 20]

    test_file = open("german/test.txt", 'w')

    print("COUNT:")
    for ii in good_verbs:
        print("%s\t%i" % (ii, len(sentences[ii])))
Example #52
        query = neo4j.CypherQuery(graph_db, cypherQuery)
        for record in query.stream():
          if len(record) > 1: # a relationship exists
           break


        print "The answer is..."
        print answer_node
        raw_input ('Press enter to ask more')
        question()

    #Creating a Knowledge Graph sort of thing
    graph_db = neo4j.GraphDatabaseService()
    batch = neo4j.WriteBatch(graph_db)

    TreeBankTokenizer = TreebankWordTokenizer()
    PunktTokenizer = PunktSentenceTokenizer()

    filename = raw_input("Enter file name\n")
    f = open(filename,'rU')
    raw = f.read()
    #normalize text
    for p in string.punctuation:
        if p != ',' :
            raw = raw.replace(p, '')
    raw = raw.strip()

    #IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    tokens = []

Example #53
    def question():
        # question asking part
        qText = input('Enter a question...\n')
        graph_db = neo4j.GraphDatabaseService()
        batch = neo4j.WriteBatch(graph_db)

        TreeBankTokenizer = TreebankWordTokenizer()
        PunktTokenizer = PunktSentenceTokenizer()


        qIdentifiers = {
                        "What": ' ',
                        "Who": 'PERSON',
                        "Where": 'GPE',
                        "When": 'TIME',
                        "Why":'',
                        "How":''
                        }
        entities = []
        tokens = []
        for sentence in PunktTokenizer.tokenize(qText):
            chunks = ne_chunk(pos_tag(TreeBankTokenizer.tokenize(sentence)))
            for chunk in chunks:
                if hasattr(chunk, 'node'):
                    tmp_tree = nltk.Tree(chunk.node, [''.join(c[0] for c in chunk.leaves())])
                    tokens.append(tmp_tree)
                else:
                    tokens.append(chunk[0])
            entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])

            #print chunks



        #print tokens
        #entities dict
        entities_dict = {}
        for entity in entities:
            leaves = entity.leaves()
            if len(leaves) > 1:
                entities_dict[leaves[0][0] + ' ' + leaves[1][0]] = entity.node
            else:
                entities_dict[leaves[0][0]] = entity.node

        #print entities_dict


      # Q&A answering algorithm
        #  Find the type of question
        qId = ''
        for key in qIdentifiers.keys():
          if key in str(qText):
             #remove key from text
             qText = qText.split(key)[1]
             print qText
             qId = qIdentifiers[key]
        #  Find what kind of answer is required
        answerType = qId
        # find the relation closest to the question text
        maximum = 0.0
        queryRel = ''
        for rel in relations.keys():
            # string similarity between the question text and the relation text
            #score =  stringcomp(str(qText),str(relations[int(rel)]))
            score = SequenceMatcher(None, str(qText), str(relations[int(rel)])).ratio()
            if score > maximum:
                maximum = score
                queryRel = "`" + str(rel) + "`"

        #print queryRel
        #  Find start node
        try:
            start_node = entities_dict.keys()[0]
        except Exception, err:
            print 'No entity found in the question'
            question()
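The question() function above assumes a relations mapping that is populated elsewhere; a later comment notes that, for the time being, it is read from a file. Its exact contents are not shown, but from the usage (integer keys via int(rel), relation text via str(relations[int(rel)])) it is presumably a dict from relation ids to the textual form of each relation. A purely illustrative, hypothetical example of that shape:

# hypothetical entries; the real relations are loaded from a file not shown here
relations = {
    0: 'was born in',
    1: 'is the capital of',
    2: 'works at',
}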
def main(sysargs):
    sys.argv = sysargs
    arg_parser = argparse.ArgumentParser(description='Formats debates by removing HTML and filtering words.')
    arg_parser.add_argument('-i', '--infile', required=True, help='Debate file to format.')
    args = arg_parser.parse_args()

    # Initialize nltk elements.
    parser = SpeechHTMLParser()
    sent_splitter = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    tagger_loc = '/het/users/jengi/stanford-postagger/'
    tagger = StanfordTagger(tagger_loc + 'models/wsj-0-18-bidirectional-distsim.tagger', \
                                tagger_loc + 'stanford-postagger.jar')
    stemmer = SnowballStemmer('english')

    # Read infile.
    speaker_pattern = re.compile('.*:')
    null_pattern = re.compile('\s*(\[[^\]]*\]|\([^\)]*\))')
    dash_pattern = re.compile('\S+(--)\s+')
    ellipse_pattern = re.compile('\s*\.\.\.\s*')
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', \
                 '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', \
                 '\\', ']', '^', '_', '`', '{', '|', '}', '~']
    block_lengths = []
    with open(args.infile, 'r') as afile:
        file_contents = afile.read()
        parser.feed(file_contents)
        parser.close()

        num_blocks = 0
        speeches = {}
        for (speaker, block) in parser.text:
            if num_blocks % 10 == 0:
                print >> sys.stderr, 'Processing block ' + str(num_blocks) + ' ...'
            orig_block = block

            # Remove applause, laughter, etc.
            block = repeated_search(block, null_pattern, 0)

            # Remove -- from the end of words.  (Indicates stuttering / stopping.)
            block = repeated_search(block, dash_pattern, 1)

            # Do more complex tokenization.
            sents = sent_splitter.tokenize(block)
            sents = [ellipse_pattern.sub(' ... ', sent) for sent in sents]
            tokens = [tokenizer.tokenize(sent) for sent in sents]

            # Run POS tagger and keep only nouns.
            # Also lowercase and stem these nouns.
            tags = [tagger.tag(toks) for toks in tokens]
            tokens = []
            tagged_text = []
            for sent in tags:
                tokens.append([])
                for (word, tag) in sent:
                    tagged_text.append(word)
                    tagged_text.append(tag)
                    if tag in noun_tags:
                        tokens[len(tokens) - 1].append(stemmer.stem(word.lower()))

            # Remove any "sentences" that are actually empty and
            # any tokens that are pure punctuation.
            for i in reversed(range(len(tokens))):
                for j in reversed(range(len(tokens[i]))):
                    non_punct = ''.join([tok for tok in tokens[i][j] if tok not in punct])
                    if len(non_punct) == 0:
                        del tokens[i][j]

                if len(tokens[i]) == 0:
                    del tokens[i]

            # Make sure there is still at least one sentence left.
            num_sents = len(tokens)
            if num_sents == 0:
                continue

            # Add block to speeches dictionary.
            speaker = speaker[:speaker_pattern.match(speaker).end() - 1]
            if speaker not in speeches:
                speeches[speaker] = []
            speeches[speaker].append(orig_block)
            speeches[speaker].append(' '.join(tagged_text))
            speeches[speaker].append('\n'.join([' '.join(sent) for sent in tokens]))
            #print speeches[speaker][0]
            #print speeches[speaker][1]
            #print speeches[speaker][2]

            num_blocks += 1
            num_tokens = 0
            for toks in tokens:
                num_tokens += len(toks)
            block_lengths.append(num_tokens)

    # Save each speaker's text to a file.
    (infolder, basename) = os.path.split(os.path.abspath(args.infile))
    out_prefix = infolder + '/'
    out_suffix = basename
    for speaker in speeches:
        # Create outfile prefixed by speaker's name.
        outfile = open(out_prefix + speaker + '-' + out_suffix, 'w')

        # Save text to outfile.
        blocks = speeches[speaker]
        for i in range(0, len(blocks), 3):
            print >> outfile, blocks[i]
            print >> outfile, blocks[i + 1]
            print >> outfile, blocks[i + 2]
            print >> outfile

        outfile.close()

    print '# of blocks: ' + str(num_blocks)
    print 'Mean # of tokens (per block): ' + str(scipy.mean(block_lengths))
    print 'Median # of tokens: ' + str(scipy.median(block_lengths))
    print 'Standard deviation in # of tokens: ' + str(scipy.std(block_lengths))
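The script above relies on a repeated_search helper that is defined elsewhere in the original source and not shown here. Judging from its call sites (group 0 to delete whole bracketed applause/laughter asides, group 1 to strip the trailing "--" from stuttered words), it presumably keeps applying the regex and deleting the matched group until no match remains. A minimal sketch under that assumption, not the original implementation:

def repeated_search(text, pattern, group):
    # hypothetical reconstruction: repeatedly delete the given regex group
    # from the text until the pattern no longer matches
    match = pattern.search(text)
    while match:
        text = text[:match.start(group)] + text[match.end(group):]
        match = pattern.search(text)
    return text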
示例#55
0
def main():

    text = raw_input('Enter a question...\n')
    print text
    graph_db = neo4j.GraphDatabaseService()
    batch = neo4j.WriteBatch(graph_db)

    TreeBankTokenizer = TreebankWordTokenizer()
    PunktTokenizer = PunktSentenceTokenizer()


    qIdentifiers = {
                    "What": ' ',
                    "Who": 'PERSON',
                    "Where": 'GPE',
                    "When": 'TIME',
                    "Why":'',
                    "How":''
                    }
    entities = []
    tokens = []
    for sentence in PunktTokenizer.tokenize(text):
        chunks = ne_chunk(pos_tag(TreeBankTokenizer.tokenize(sentence)))
        for chunk in chunks:
            if hasattr(chunk, 'node'):
                tmp_tree = nltk.Tree(chunk.node, [''.join(c[0] for c in chunk.leaves())])
                tokens.append(tmp_tree)
            else:
                tokens.append(chunk[0])
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])

        #print chunks



    print tokens
    #entities dict
    entities_dict = {}
    for entity in entities:
        leaves = entity.leaves()
        if len(leaves) > 1:
            entities_dict[leaves[0][0] + leaves[1][0]] = entity.node
        else:
            entities_dict[leaves[0][0]] = entity.node

    print entities_dict

    class doc():pass
    doc.headline=['']
    doc.text = tokens



  # Q&A answering algorithm
    #  Find the type of question
    qId = ''
    for key in qIdentifiers.keys():
      if key in str(text):
         print key
         qId = qIdentifiers[key]
    #  Find what kind of answer is required
    answerType = qId
    #  Find start node
    start_node = entities_dict.keys()[0]
    start_node_type = entities_dict[start_node]
    # Run string similarity between the relation text and the question text
    # (for the time being, the relations are read from a file)

    #  Build query
    cypherQuery = "START me=node:objects(name='" + start_node + "') MATCH me-[r]->obj  RETURN r,obj.name LIMIT 10 "
    #  Start Graph traversal
    query = neo4j.CypherQuery(graph_db, cypherQuery)
    for record in query.stream():
        print 'printing records'
        print record[0]
        print record[1]
        print '\n'