def text2sents(text, lemmatize=False, stemmer=None):
    """
    converts a text into a list of sentences consisted of normalized words
    :param text: list of string to process
    :param lemmatize: if true, words will be lemmatized, otherwise -- stemmed
    :param stemmer: stemmer to be used, if None, PortedStemmer is used. Only applyed if lemmatize==False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)

    tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer

    sents_normalized = []

    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]

        sents_normalized.append(sent_normalized)
    return sents_normalized
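The snippet above assumes the usual NLTK imports (sent_tokenize, RegexpTokenizer, WordNetLemmatizer, PorterStemmer, PerceptronTagger) and a get_wordnet_pos helper that is not shown. A minimal sketch of such a helper, using the standard Penn Treebank to WordNet tag mapping, might look like this:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # map a Penn Treebank tag (as produced by PerceptronTagger) to the
    # WordNet POS constant expected by WordNetLemmatizer.lemmatize()
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN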
def get_tagger():
    """
    Returns a PerceptronTagger instance.
    """
    return PerceptronTagger()
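A brief usage sketch of the returned tagger on a pre-tokenized sentence (the output shown is indicative):

tagger = get_tagger()
print(tagger.tag(['This', 'is', 'a', 'sentence']))
# e.g. [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sentence', 'NN')]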
Example #3
    def pos_tag_reviews(records):
        print('%s: tag reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        tagger = PerceptronTagger()

        for record in records:
            tagged_words =\
                nlp_utils.tag_words(record[Constants.TEXT_FIELD], tagger)
            record[Constants.POS_TAGS_FIELD] = tagged_words
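Here nlp_utils and Constants are project-specific modules that are not shown. As a rough, hypothetical sketch (not the project's actual code), tag_words presumably tokenizes the review text and runs the shared tagger over it:

from nltk.tokenize import word_tokenize

def tag_words(text, tagger):
    # hypothetical stand-in for nlp_utils.tag_words: tokenize, then POS-tag
    return tagger.tag(word_tokenize(text))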
Example #4
def clean_text(text,
               stopwords,
               remove_stopwords=True,
               pos_filtering=False,
               stemming=True,
               lower_case=True):
    if lower_case:
        # convert to lower case
        text = text.lower()
    # strip extra white space
    text = re.sub(' +', ' ', text)
    # strip leading and trailing white space
    text = text.strip()
    # tokenize (split based on whitespace)
    tokens = text.split(' ')

    # remove punctuation
    tokens = [t for t in tokens if t not in string.punctuation]

    if pos_filtering:
        tagger = PerceptronTagger()
        # apply POS-tagging
        tagged_tokens = tagger.tag(tokens)
        # retain only nouns, adjectives, and verbs
        tokens = [
            item[0] for item in tagged_tokens if item[1] in [
                'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD',
                'VBG', 'VBN', 'VBP', 'VBZ'
            ]
        ]
    if remove_stopwords:
        # remove stopwords
        tokens = [token for token in tokens if token.lower() not in stopwords]
    if stemming:
        stemmer = nltk.stem.PorterStemmer()
        # apply Porter's stemmer
        tokens_stemmed = list()
        for token in tokens:
            tokens_stemmed.append(stemmer.stem(token))
        tokens = tokens_stemmed

    return tokens
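A usage sketch of clean_text, assuming NLTK's English stopword list is available locally:

from nltk.corpus import stopwords as nltk_stopwords

tokens = clean_text('The quick brown foxes are jumping over the lazy dog',
                    stopwords=set(nltk_stopwords.words('english')),
                    pos_filtering=True)
print(tokens)  # roughly ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']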
Example #5
class Syntax(Characteristic):
    """
    Accounts for the syntactic aspects of the source text.
        Word-for-word:  Parses and stores part-of-speech (POS) tags.
        Entire text:    Enumerates all configurations of clause and sentence found in
                        the text.
    """
    POS_TAG = PerceptronTagger()

    def __init__(self, tokens):
        Characteristic.__init__(self)
        self.tokens = tokens

    def pos_tag(self):
        pass

    def find_clauses(self):
        pass

    def find_sentences(self):
        pass
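The methods above are stubs; a minimal sketch of how pos_tag could use the shared class-level tagger, assuming the constructor stores its tokens argument as self.tokens, might be:

    def pos_tag(self):
        # assumption: self.tokens is a list of word strings
        # tag the stored tokens with the class-level PerceptronTagger
        self.pos_tags = Syntax.POS_TAG.tag(self.tokens)
        return self.pos_tags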
Example #6
if domain == 'meeting':
    path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
    path_to_filler_words = path_to_root + 'resources/stopwords/meeting/filler_words.' + language + '.txt'
    stopwords = utils.load_stopwords(path_to_stopwords)
    filler_words = utils.load_filler_words(path_to_filler_words)

    if dataset_id == 'ami':
        ids = meeting_lists.ami_development_set + meeting_lists.ami_test_set
    elif dataset_id == 'icsi':
        ids = meeting_lists.icsi_development_set + meeting_lists.icsi_test_set

if language == 'en':
    path_to_word2vec_keys = path_to_root + 'resources/word2vec_keys.txt'
# tokenizer = DictionaryTokenizer(path_to_word2vec_keys) # highly time-consuming
# tokenizer = TweetTokenizer()
tagger = PerceptronTagger()

# ######################
# ### CORPUS LOADING ###
# ######################
corpus = {}
for id in ids:
    if domain == 'meeting':
        if dataset_id == 'ami' or dataset_id == 'icsi':
            if source == 'asr':
                path = path_to_root + 'data/meeting/' + dataset_id + '/' + id + '.da-asr'
            elif source == 'manual':
                path = path_to_root + 'data/meeting/' + dataset_id + '/' + id + '.da'
            # filler words will be removed during corpus loading
            corpus[id] = utils.read_ami_icsi(path, filler_words)
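Later steps of this script presumably run the tagger built above over the loaded corpus; the structure returned by utils.read_ami_icsi is not shown here, so the following is only a hedged sketch assuming each corpus entry can be treated as a list of token lists:

# hypothetical follow-up step (not part of the original script)
tagged_corpus = {}
for meeting_id, utterances in corpus.items():
    # assumption: each utterance is a list of token strings
    tagged_corpus[meeting_id] = [tagger.tag(utterance) for utterance in utterances]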