Example #1
def tokenize():
    text = request.json["text"]
    try:
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    except LookupError:
        nltk.download('punkt')
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    return {"tokens": [(s[0], s[1], text[s[0]:s[1]]) for s in spans]}
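For reference, span_tokenize yields (start, end) character offsets into the original string, which the endpoint above turns into (start, end, surface form) triples. A minimal standalone sketch (the sample sentence is made up):

from nltk.tokenize.treebank import TreebankWordTokenizer

text = "Good muffins cost $3.88 in New York."
spans = list(TreebankWordTokenizer().span_tokenize(text))
# rebuild (start, end, surface form) triples, as the endpoint above does
print([(s, e, text[s:e]) for s, e in spans][:3])
# e.g. [(0, 4, 'Good'), (5, 12, 'muffins'), (13, 17, 'cost')]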
Example #2
async def tokenize(request: Request):
    body = await request.json()
    text = body["text"]
    print(text)
    try:
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    except LookupError:
        nltk.download('punkt')
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    return {"tokens": [(s[0], s[1], text[s[0]:s[1]]) for s in spans]}
Example #3
 def get_tf_idf_score(self, sentence, mode, ngram=1):
     if ngram not in range(1, 4):
         try:
             raise ValueError
         except ValueError as v:
             print "Only unigrams, bigrams and trigrams are supported."
     if mode != "lex" and mode != "pos":
         try:
             raise ValueError
         except ValueError as v:
             print "Only lexical and POS distinctness supported."
     if len(self.document_freqs_lex.keys()) == 0 or len(
             self.document_freqs_pos.keys()) == 0:
         try:
             raise AttributeError
         except AttributeError as ae:
             print "Document frequency dictionaries not initialized. Call load_doc_freqs() " \
                   "on the LM object."
     tokenizer = TreebankWordTokenizer()
     sentence = sentence.lower()
     tokens = tokenizer.tokenize(sentence)
     tokens = self.__fix_tokens(tokens)
     tags = nltk.pos_tag(tokens)
     tags = self.__add_start_end_tags(tags)
     if mode == "lex":
         score = self.__get_lex_tf_idf(tags, ngram)
         return score
     else:
         score = self.__get_pos_tf_idf(tags, ngram)
         return score
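The scoring above runs nltk.pos_tag over Treebank tokens before computing lexical or POS tf-idf. A standalone illustration of that tagging step (the resource name passed to nltk.download can differ between NLTK versions):

import nltk
from nltk.tokenize.treebank import TreebankWordTokenizer

nltk.download('averaged_perceptron_tagger', quiet=True)  # tagger model used by pos_tag

tokens = TreebankWordTokenizer().tokenize("the quick brown fox jumps")
print(nltk.pos_tag(tokens))
# e.g. [('the', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ')]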
Example #4
def term_frequency(sentence, ngrams=4):
    """Given a sentence, calculates term frequency of tuples.

    Parameters
    ----------
    sentence : str
        Sentence whose term frequency has to be calculated.
    ngrams : int
        Number of n-grams for which term frequency is calculated.

    Returns
    -------
    dict
        {tuple : int} key-value pairs representing term frequency.
    """
    sentence = sentence.lower().strip()
    for punc in PUNCTUATIONS:
        sentence = sentence.replace(punc, "")
    words = TreebankWordTokenizer().tokenize(sentence)
    counts = {}
    for i in range(ngrams):
        for j in range(len(words) - i):
            ngram = tuple(words[j:(j + i + 1)])
            if ngram in counts:
                counts[ngram] += 1
            else:
                counts[ngram] = 1
    return counts
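The same counting logic, shown standalone with collections.Counter so the expected output is easy to verify (toy sentence; punctuation stripping omitted):

from collections import Counter
from nltk.tokenize.treebank import TreebankWordTokenizer

words = TreebankWordTokenizer().tokenize("the cat sat on the mat")
ngrams = 2
counts = Counter(tuple(words[j:j + i + 1])
                 for i in range(ngrams)
                 for j in range(len(words) - i))
print(counts[('the',)])        # 2
print(counts[('the', 'cat')])  # 1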
Example #5
    def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError(
                '``TreebankEncoder`` does not take keyword argument ``tokenize``.'
            )

        if 'detokenize' in kwargs:
            raise TypeError(
                '``TreebankEncoder`` does not take keyword argument ``detokenize``.'
            )

        try:
            import nltk

            # Required for moses
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')

            from nltk.tokenize.treebank import TreebankWordTokenizer
            from nltk.tokenize.treebank import TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise

        super().__init__(*args,
                         tokenize=TreebankWordTokenizer().tokenize,
                         detokenize=TreebankWordDetokenizer().detokenize,
                         **kwargs)
Example #6
    def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print "Gathering sentences and removing stopwords"
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [
                word for word in TreebankWordTokenizer().tokenize(line.lower())
                if word not in stopwords
            ]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        # print(documents)
        print(len(documents), "documents read")
        print(len(self.dictionary), "unique tokens", self.dictionary)
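A minimal sketch of the gensim pipeline the constructor builds, on two toy sentences (assumes gensim is installed; stopword removal omitted):

from gensim import corpora
from gensim.models import TfidfModel
from nltk.tokenize.treebank import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
documents = [tokenizer.tokenize(s.lower())
             for s in ("The cat sat on the mat.", "The dog chased the cat.")]

dictionary = corpora.Dictionary(documents)               # token -> integer id
corpus = [dictionary.doc2bow(doc) for doc in documents]  # bag-of-words vectors
tf_idf_model = TfidfModel(corpus)
print(tf_idf_model[corpus[0]])                           # (token id, tf-idf weight) pairs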
Example #7
def generate_syntactically_similar_sentences_replace(num_of_perturb, dataset):
	"""Generate syntactically similar sentences for each sentence in the dataset.
	For PaInv-Replace
	Returns dictionary of original sentence to list of generated sentences
	"""
	# Use nltk treebank tokenizer and detokenizer
	tokenizer = TreebankWordTokenizer()
	detokenizer = TreebankWordDetokenizer()

	# Stopwords from nltk
	stopWords = list(set(stopwords.words('english')))

	# File from which sentences are read
	file = open(dataset, "r")

	# when we use Bert
	berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
	bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
	bertmodel.eval()

	# Number of perturbations to make for a word in a sentence
	# (hardcoded to 50 here, overriding the num_of_perturb argument)
	num_of_perturb = 50
	dic = {}
	num_sent = 0
	for line in file:
		s_list = line.split("\n")
		source_sent = s_list[0]
		# Generating new sentences using BERT
		new_sents = perturb(source_sent, bertmodel, num_of_perturb)
		dic[line] = new_sents		
		if new_sents != []:
			num_sent += 1
	return dic
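perturb() is not shown above; presumably it masks a word and asks the masked language model for replacement candidates. A rough sketch of that lookup, assuming the Hugging Face transformers API (the sentence and the top-k value are made up):

import torch
from transformers import BertTokenizer, BertForMaskedLM

berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
bertmodel.eval()

sentence = "The quick brown [MASK] jumps over the lazy dog."
inputs = berttokenizer(sentence, return_tensors="pt")
with torch.no_grad():
    logits = bertmodel(**inputs).logits

# position of the [MASK] token, then its top-5 candidate fillers
mask_pos = (inputs["input_ids"][0] == berttokenizer.mask_token_id).nonzero(as_tuple=True)[0]
top_ids = logits[0, mask_pos].topk(5, dim=-1).indices[0]
print(berttokenizer.convert_ids_to_tokens(top_ids.tolist()))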
Example #8
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.
    Use NLTK's TreebankWordTokenizer.
    Note that we first split into sentences using NLTK's sent_tokenize.
    We additionally call a filtering function to remove unwanted tokens.
    
    IN:
    - text, str
    OUT:
    - list of strings
    '''
    ## list of tokens
    list_tokens = []

    ## split text into sentences
    sentences = sent_tokenize(text, language=language)

    ## define the tokenizer
    tokenizer = TreebankWordTokenizer()
    ## loop over all sentences
    for sent in sentences:
        ## tokenize the sentence
        sent_tokenized = tokenizer.tokenize(sent)
        ## lowercase the tokens
        ## add tokens to list of tokens
        list_tokens += sent_tokenized
    list_tokens = filter_tokens(list_tokens)
    return list_tokens
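The sentence-then-word pattern used in tokenize_text, shown standalone (punkt is the sentence model behind sent_tokenize; the sample text is made up):

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

nltk.download('punkt', quiet=True)  # sentence model used by sent_tokenize

text = "Dr. Smith arrived late. He apologized twice."
tokenizer = TreebankWordTokenizer()
tokens = [tok for sent in sent_tokenize(text, language="english")
          for tok in tokenizer.tokenize(sent)]
print(tokens)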
Example #9
def tokenize(documents):
    documents2 = []
    tbw = TreebankWordTokenizer()
    for doc in documents:
        # collect this document's tokens separately (reset per document)
        real_tokens = []
        text = doc["text"]
        file = doc["id"]
        text = text.replace("\"","'")
        #text = text.replace("/", " ")
        text = text.replace("-", " ")
        text = text.replace(".", " ")
        tokens = tbw.span_tokenize(text)
        for token in tokens:
            token_txt = text[token[0]:token[1]]
            found = False
            for tag in doc["tags"]:
                if int(tag["start"])<=token[0] and int(tag["end"])>=token[1]:
                    token_tag = tag["tag"]
                    token_tag_type = tag["type"]
                    found = True
            if found==False:
                token_tag = "O"
                token_tag_type = "O"

            real_tokens.append({"token":token_txt,"start":token[0],"end":token[1],"tag":token_tag,"tag_type":token_tag_type})
        documents2.append({"id": file, "text": text, "tags": doc["tags"],"tokens":real_tokens})
    return documents2
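The containment check above maps character-offset annotations onto Treebank token spans. A small standalone sketch with made-up tags:

from nltk.tokenize.treebank import TreebankWordTokenizer

text = "Alice visited Paris"
tags = [{"start": 0, "end": 5, "tag": "B-PER", "type": "PERSON"},
        {"start": 14, "end": 19, "tag": "B-LOC", "type": "LOCATION"}]

for start, end in TreebankWordTokenizer().span_tokenize(text):
    # a token inherits a tag when it lies entirely inside the annotated span
    label = next((t["tag"] for t in tags
                  if int(t["start"]) <= start and int(t["end"]) >= end), "O")
    print(text[start:end], label)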
Example #10
 def __init__(self, data, tokenizer):
     self._text = to_unicode(data).strip()
     self._tokenizer = tokenizer
     self._treebank_word_tokenize = TreebankWordTokenizer().tokenize
     self.formdocument()
     self.extractsentences()
     self.extractwords()
Example #11
    def __init__(self):
        filename = 'Models/CRF_crfsuite_dict.crfsuite'
        self.crf_model = pycrfsuite.Tagger()
        self._treebank_word_tokenizer = TreebankWordTokenizer()
        country_file = open("Dictionaries/Countries.txt",'r', encoding='utf-8')
        self.dictionary_country = country_file.readlines()
        self.dictionary_country = set([line[:-1] for line in self.dictionary_country])
        city_file = open("Dictionaries/Cities.txt",'r', encoding='utf-8')
        self.dictionary_city = city_file.readlines()
        self.dictionary_city = set([line[:-1] for line in self.dictionary_city])

        first_name_file = open("Dictionaries/dictionary_first_names.txt", 'r', encoding='utf-8')
        self.dictionary_first_name = first_name_file.readlines()
        self.dictionary_first_name = set([line[:-1].lower() for line in self.dictionary_first_name])

        surname_file = open("Dictionaries/dictionary_surnames.txt", 'r', encoding='utf-8')
        self.dictionary_surname = surname_file.readlines()
        self.dictionary_surname = set([line[:-1].lower() for line in self.dictionary_surname])

        if os.path.exists(filename):
            self.crf_model.open('Models/CRF_crfsuite_dict.crfsuite')
        else:
            self.crf_model = None
        self.dictionary_job_titles = []
        with open('Dictionaries/job_title_dictionary.csv', encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file,delimiter=',')
            for row in csv_reader:
                if row[2]=='assignedrole':
                    candidates = row[0].lower().split(' ')
                    for can in candidates:
                        if len(can)>2:
                            self.dictionary_job_titles.append(can)
        self.dictionary_job_titles = set(self.dictionary_job_titles)
        pass
Example #12
    def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError(
                'TreebankEncoder defines a tokenize callable TreebankWordTokenizer'
            )

        try:
            import nltk

            # Required for moses
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')

            from nltk.tokenize.treebank import TreebankWordTokenizer
            from nltk.tokenize.treebank import TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise

        self.detokenizer = TreebankWordDetokenizer()

        super().__init__(*args,
                         **kwargs,
                         tokenize=TreebankWordTokenizer().tokenize)
Example #13
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
    from nltk.tokenize.treebank import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()

    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from string import ascii_lowercase

    docs = {}
    print("Found %i stories" % stories.count())
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        # text = story.story_title
        text = ''.join(
            BeautifulSoup(text, features="lxml").findAll(text=True)).lower()
        if delimiter:
            sections = text.split(delimiter)
        else:
            sections = [text]

        if doc_limit > 0 and len(docs) > doc_limit:
            print("Passed doc limit %i" % len(docs))
            break
        print(story.story_title, len(sections))

        for jj in range(len(sections)):
            # keep tokens that are not stopwords and consist only of lowercase letters
            docs["%s-%i" % (story.story_title, jj)] = [
                x for x in tokenizer.tokenize(sections[jj])
                if x not in stop and all(y in ascii_lowercase for y in x)
            ]
    return docs
Example #14
    def __init__(self):
        '''
        Constructor
        '''
        self.__tokenizer = TreebankWordTokenizer()

        self.__r_end_sentence = re.compile(r"\.|\?|!")
Example #15
def word_tokenize(text, language="spanish"):
    """
		It splits the text into words
		
		Args:
			text:		text to be splited
			language:	language of the tokenizer to be used
			
		Returns:
			List of words
	"""

    #try to use from local
    try:
        from nltk.tokenize.treebank import TreebankWordTokenizer

        _treebank_word_tokenize = TreebankWordTokenizer().tokenize

        return [
            token for sent in sent_tokenize(text)
            for token in _treebank_word_tokenize(sent)
        ]

    #if not, use nltk
    except IOError:
        from nltk import word_tokenize

        return word_tokenize(text, language)
Example #16
    def __init__(self, language):
        """Take language as argument to the class. Check availability and
        setup class variables."""
        self.language = language
        self.available_languages = [
            'akkadian',
            'arabic',
            'french',  # defaults to old_french
            'greek',
            'latin',
            'middle_english',
            'middle_french',
            'middle_high_german',
            'old_french',
            'old_norse',
            'sanskrit',
            'multilingual'
        ]

        assert self.language in self.available_languages, \
            "Specific tokenizer not available for '{0}'. Only available for: '{1}'.".format(
                self.language,
                self.available_languages)

        # raise languages-specific warnings
        if self.language == 'french':
            self.language = 'old_french'
            LOG.warning("'french' defaults to 'old_french'. 'middle_french' also available.")  # pylint: disable=line-too-long

        if self.language == 'arabic':
            self.toker = BaseArabyWordTokenizer('arabic')
        elif self.language == 'french':
            self.toker = BaseRegexWordTokenizer('old_french',
                                                OldFrenchTokenizerPatterns)
        elif self.language == 'greek':
            self.toker = BasePunktWordTokenizer('greek',
                                                GreekRegexSentenceTokenizer)
        elif self.language == 'latin':
            self.toker = LatinWordTokenizer()
        elif self.language == 'old_norse':
            self.toker = BaseRegexWordTokenizer('old_norse',
                                                OldNorseTokenizerPatterns)
        elif self.language == 'middle_english':
            self.toker = BaseRegexWordTokenizer(
                'middle_english', MiddleEnglishTokenizerPatterns)
        elif self.language == 'middle_french':
            self.toker = BaseRegexWordTokenizer('old_french',
                                                OldFrenchTokenizerPatterns)
        elif self.language == 'middle_high_german':
            self.toker = BaseRegexWordTokenizer(
                'middle_high_german', MiddleHighGermanTokenizerPatterns)
        elif self.language == 'old_french':
            self.toker = BaseRegexWordTokenizer('old_french',
                                                OldFrenchTokenizerPatterns)
        else:
            LOG.warning(
                "Falling back to default tokenizer, the NLTK's `TreebankWordTokenizer()`."
            )
            self.toker = TreebankWordTokenizer()
Example #17
def normalize(text):
    text = strip_accents_ascii(text.decode('utf-8'))
    text = text.encode('utf-8')
    text = ' '.join(
        map(lambda x: x.lower(),
            TreebankWordTokenizer().tokenize(text)))
    text = str(TextBlob(text).correct())
    return text
Example #18
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    tokenizer = TreebankWordTokenizer()
    with open(path , encoding = 'utf8') as f:
        for l in f:
            table = str.maketrans(dict.fromkeys(string.punctuation + '0123456789')) #to remove numbers & punctuation
            sentences.append( tokenizer.tokenize(l.translate(table).lower()) )
    return sentences
Example #19
def treebank_tokenizer(sentence):
    # split clitics such as 's; < and > are masked before tokenizing and restored afterwards
    t = TreebankWordTokenizer()
    word_lst = t.tokenize(sentence.lower().replace("<", "LAB_").replace(
        ">", "_RAB"))
    ret = []
    for w in word_lst:
        ret.append(w.replace("LAB_", "<").replace("_RAB", ">"))
    return ret
Example #20
 def tokenize(self, text: str):
     """
     :param text: text to be tokenized into sentences
     :type text: str
     :rtype: list
     """
     sents = self.sent_tokenizer.tokenize(text)
     tokenizer = TreebankWordTokenizer()
     return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
Example #21
    def __init__(self):
        filename = 'Models/CRF_crfsuite.crfsuite'
        self.crf_model = pycrfsuite.Tagger()
        self._treebank_word_tokenizer = TreebankWordTokenizer()
        if os.path.exists(filename):
            self.crf_model.open('Models/CRF_crfsuite.crfsuite')
        else:
            self.crf_model = None

        pass
Example #22
def normalize(text):
    text = text.decode('utf-8')
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    text = strip_accents_ascii(text)
    text = text.encode('utf-8')
    text = ' '.join(
        map(lambda x: x.lower(),
            TreebankWordTokenizer().tokenize(text)))
    return text
Example #23
def treebank_tokenizer(sentence):
    tokenizer = load('data/german.pickle')
    treebank_word_tokenize = TreebankWordTokenizer().tokenize
    tokens = []
    for s in tokenizer.tokenize(sentence):
        tokens.extend([token for token in treebank_word_tokenize(s)])
    tokens = [''.join(i for i in s if i not in string.punctuation)
              for s in tokens]
    tokens = list(filter(None, tokens))
    return tokens
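The snippet above loads a local german.pickle; the stock German Punkt model that ships with NLTK's punkt data can be loaded the same way. A hedged sketch (sample sentences made up):

import nltk
from nltk.tokenize.treebank import TreebankWordTokenizer

nltk.download('punkt', quiet=True)
sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
word_tokenize = TreebankWordTokenizer().tokenize

text = "Das ist ein Satz. Und hier noch einer!"
print([token for sent in sent_tokenizer.tokenize(text) for token in word_tokenize(sent)])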
Example #24
def english_tokenization(term):
  word_tokenizer = TreebankWordTokenizer()
  tokenized_term = ""

  for word in word_tokenizer.tokenize(term):
    if tokenized_term != "":
      tokenized_term += " "
    tokenized_term += word

  return tokenized_term
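Since the loop only re-joins tokens with single spaces, it behaves like a plain join over the token list; a quick standalone check (sample term made up):

from nltk.tokenize.treebank import TreebankWordTokenizer

term = "the parser's state-of-the-art output"
print(" ".join(TreebankWordTokenizer().tokenize(term)))
# e.g. "the parser 's state-of-the-art output"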
Example #25
def get_words_locations(txt):
    sents = PunktSentenceTokenizer().tokenize(txt)
    words = [TreebankWordTokenizer().tokenize(i) for i in sents]
    words = [i for word in words for i in word]
    span_sents = [i for i in PunktSentenceTokenizer().span_tokenize(txt)]
    span_words = [[j for j in TreebankWordTokenizer().span_tokenize(i)]
                  for i in PunktSentenceTokenizer().tokenize(txt)]
    new_span_words = []
    for i, j in zip(span_sents, span_words):
        new_span_words_in_sent = []
        for k in j:
            new_span_words_in_sent.append((k[0] + i[0], k[1] + i[0]))
        new_span_words.append(new_span_words_in_sent)
    new_span_words = [i for span_word in new_span_words for i in span_word]
    _words = []
    _span_words = []
    for i, j in zip(words, new_span_words):
        if i not in string.punctuation:
            punkt_list = re.split(r'\W', i.lower())
            if len(punkt_list) >= 2 and not all(
                [ii.isnumeric() for ii in punkt_list]):
                mm = j[0]
                for k in punkt_list:
                    if k == '':
                        mm += 1
                    elif re.match(r'^\d+[a-zA-Z]+$', k):
                        k0 = re.search(r'(^\d+)', k).group()
                        k1 = re.search(r'([a-zA-Z]+$)', k).group()
                        _words.extend([k0, k1])
                        _span_words.append((mm, mm + len(k0)))
                        mm += len(k0)
                        _span_words.append((mm, mm + len(k1)))
                        mm += len(k1)
                        mm += 1
                    else:
                        _words.append(k)
                        _span_words.append((mm, mm + len(k)))
                        mm += len(k)
                        mm += 1
            else:
                _words.append(i.lower())
                _span_words.append(j)
    return _words, _span_words
Example #26
def transform_texts(art,
                    period,
                    site,
                    ngrams=1,
                    mod=None,
                    text_column='text',
                    text_token_column='text_token',
                    remain_columns=('author', 'site', 'link')):
    """Transform dataframe with texts, create tokenized lists in columns.
    Save dataframe to mod directory, if mod is not None."""
    text_column_paragraphs = text_column + '_paragraphs'
    text_token_column_lower = text_token_column + '_lower'
    text_token_column_stemmed = text_token_column + '_stemmed'
    text_token_column_count = text_token_column + '_count'

    st = SnowballStemmer('english')
    art.dropna(subset=[text_column],
               inplace=True)  # maketrans fails if there are nans
    art_sh = art[list((text_column, ) +
                      remain_columns)].copy()  # we don't need more columns
    del art
    gc.collect()

    additional_punctuation = string.punctuation + '«»…—’‘“”–•'  # a few additional, non-ascii chars
    # gigaom
    tt = TreebankWordTokenizer()
    art_sh[text_column] = art_sh[text_column].apply(
        lambda x: x.replace('Tweet\nShare\nPost\n', '').replace(
            '“', '').replace('”', '').replace('’', '\''))
    # split the text into paragraphs; sent_tokenize is applied within each paragraph below
    art_sh[text_column_paragraphs] = art_sh[text_column].apply(
        lambda x: x.split('\n\n'))
    art_sh[text_token_column] = art_sh[text_column_paragraphs].apply(
        lambda x:
        [flatten([tt.tokenize(z) for z in sent_tokenize(y)]) for y in x])
    # to lower, stem
    art_sh[text_token_column_lower] = art_sh[text_token_column].apply(
        lambda x: [[word.lower() for word in paragraph] for paragraph in x])
    art_sh[text_token_column_stemmed] = art_sh[text_token_column_lower].apply(
        lambda x: [[st.stem(word) for word in paragraph] for paragraph in x])
    if ngrams == 2:  # convert to bigrams
        art_sh[text_token_column] = art_sh[text_token_column_lower].apply(
            to_bigram)
        art_sh[text_token_column_lower] = art_sh[
            text_token_column_lower].apply(to_bigram)
        art_sh[text_token_column_stemmed] = art_sh[
            text_token_column_stemmed].apply(to_bigram)

    art_sh[text_token_column_count] = art_sh[text_token_column_stemmed].apply(
        lambda x: dict(Counter(FreqDist(flatten(x)))))

    if mod is not None:
        art_sh.to_csv(mod + 'dfs_articles' + period + site + '.csv')

    return art_sh
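A compact look at the lowercase-then-stem step used above, on a made-up token list (SnowballStemmer ships with NLTK; no extra download needed):

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordTokenizer

st = SnowballStemmer('english')
tokens = TreebankWordTokenizer().tokenize("Sharing posted tweets quickly")
print([st.stem(token.lower()) for token in tokens])
# e.g. ['share', 'post', 'tweet', 'quick']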
Example #27
def tokenize(review: str) -> list:
    """Tokenize string based on NLTK TreebankWordTokenizer.

    Args:
        review: The raw review content.

    Returns:
        A list of tokens found by the NLTK tokenizer.
    """
    tokenizer = TreebankWordTokenizer()
    return tokenizer.tokenize(review)
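Quick usage check showing how the Treebank rules split contractions and trailing punctuation (the review text is made up):

from nltk.tokenize.treebank import TreebankWordTokenizer

print(TreebankWordTokenizer().tokenize("This isn't bad."))
# e.g. ['This', 'is', "n't", 'bad', '.']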
Example #28
def tokenize(sents):
    """Identifica los tokens del las oraciones de entrada
    
    Returns:
        Una lista de oraciones. Cada oración es una lista de tokens
    """
    tokenizer = TreebankWordTokenizer()

    sent_tokens = [tokenizer.tokenize(sent) for sent in sents]

    return sent_tokens
Example #29
    def __init__(self, tokenizer_method: str = "TreebankWordTokenizer"):
        self.token2idx = {}
        self.tokenizer = None

        if tokenizer_method == "TreebankWordTokenizer":
            self.tokenizer = TreebankWordTokenizer()
        else:
            raise NotImplementedError(
                "tokenizer_method {} doesn't exist".format(tokenizer_method))

        self.add_token(UNK_TOKEN)  # Add UNK token
Example #30
 def __init__(self):
     # nltk.download('punkt')
     self.tk = TreebankWordTokenizer()
     self.dtk = TreebankWordDetokenizer()
     self.BAD_CAT_REMOVE = re.compile('^Cat_')
     self.A_TILDE_REMOVE = re.compile('[á]')
     self.E_TILDE_REMOVE = re.compile('[é]')
     self.I_TILDE_REMOVE = re.compile('[í]')
     self.O_TILDE_REMOVE = re.compile('[ó]')
     self.U_TILDE_REMOVE = re.compile('[ú]')
     self.POINT_FOLLOWING_LETTER = re.compile(r'(?<=\S)\.(?=\w)')
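The class keeps both a tokenizer and a detokenizer; a short round-trip sketch showing why the pair is useful (quote characters may come back styled differently):

from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

tk = TreebankWordTokenizer()
dtk = TreebankWordDetokenizer()

tokens = tk.tokenize("Don't strip the accents, just re-join the tokens.")
print(tokens)
print(dtk.detokenize(tokens))  # roughly reconstructs the original sentence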