Example #1
    def extract_candidate_chunks(self, text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
        """ Extract candidate chunks from the given text, following the grammar pattern supplied """
        import itertools, nltk, string
        print('Inside extract_candidate_chunks ...')

        # exclude candidates that are stop words or entirely punctuation
        punct = set(string.punctuation)
        stop_words = set(nltk.corpus.stopwords.words('english'))

        # tokenize, POS-tag, and chunk using regular expressions and creating the
        # chunk.RegexpParser with 1 stages:
        #     RegexpChunkParser with 1 rules:
        #            <ChunkRule: '(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+'>
        # see parsing vs chunking @http://nltk.sourceforge.net/doc/en/ch06.html
        chunker = nltk.chunk.regexp.RegexpParser(grammar)

        # this will tag the word with probable pos
        tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

        # chunks the data in IOB-tags which means tagged with one of three special chunk tags,
        # I (inside), O (outside), or B (begin)
        all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))

        # join constituent chunk words into a single chunked phrase
        # 1. all_chunks is a list of (word, pos, chunk) triples; words whose IOB chunk tag is O (outside) are dropped
        # 2. group consecutive triples by whether they are inside a chunk
        # 3. lowercase the words and join the B- and I- tokens of each chunk into a single phrase
        candidates = [' '.join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]
        # return all the list of chunks after removing all the stopwords
        return [cand for cand in candidates
                if cand not in stop_words and not all(char in punct for char in cand)]
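For reference, a minimal usage sketch (not part of the original example) showing the shape of what nltk.pos_tag_sents returns before it is chunked; it assumes the NLTK data packages 'punkt', 'averaged_perceptron_tagger' and 'stopwords' have been downloaded:

import nltk

text = "Compatibility of systems of linear constraints over the set of natural numbers."
sents = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
tagged = nltk.pos_tag_sents(sents)
# tagged holds one list of (word, POS) pairs per sentence, e.g.
# [[('Compatibility', 'NN'), ('of', 'IN'), ('systems', 'NNS'), ...]]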
Example #2
    def extract_candidate_words(self, text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
        """ Get all the words that are eligible to be keywords """
        import itertools, nltk, string
        print('Inside extract_candidate_words')

        # exclude candidates that are stop words or entirely punctuation
        # 1. Getting the punctuation() list and storing
        # 2. Getting all the stopwords list and storing
        punct = set(string.punctuation)
        stop_words = set(nltk.corpus.stopwords.words('english'))

        # tokenize and POS-tag words
        # 1. split the text into sentences
        # 2. split each sentence into word tokens
        # 3. attach a POS tag to every word
        tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(self.__initString__)))

        # filter on certain POS tags and lowercase all words
        # 1. Iterate over the (word, tag) pairs and lowercase each word
        # 2. Then keep only the words whose tag is in good_tags and
        #       which are not in the stop words
        # 3. And also drop tokens that are entirely punctuation
        candidates = [word.lower() for word, tag in tagged_words
                    if tag in good_tags and word.lower() not in stop_words
                    and not all(char in punct for char in word)]

        return candidates
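A small side sketch (an assumption, not from the source) of how itertools.chain.from_iterable flattens the per-sentence tag lists into the single stream of (word, tag) pairs iterated above:

import itertools

tagged_sents = [[('Dogs', 'NNS'), ('bark', 'VBP'), ('.', '.')],
                [('Cats', 'NNS'), ('purr', 'VBP'), ('.', '.')]]
flat = list(itertools.chain.from_iterable(tagged_sents))
# [('Dogs', 'NNS'), ('bark', 'VBP'), ('.', '.'), ('Cats', 'NNS'), ('purr', 'VBP'), ('.', '.')]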
Example #3
def pipeline_pos(titles, descriptions, tags):
    def preprocess(inpt):
        return inpt

    # Create feature vectors of context and only keep images WITH context
    bar = Bar('Extracting features...', max=len(titles))
    pos_collection = []
    for i in xrange(len(titles)):
        # Stem words and remove stopwords for title...
        context = []
        title = preprocess(titles[i].split(' '))
        if title:
            context.append(title)
        # ... description (for each sentence) ...
        for desc in sent_tokenize(descriptions[i]):
            desc = preprocess(desc.split(' '))
            if desc:
                context.append(desc)
        # ... and tags
        ts = preprocess(tags[i])
        if ts:
            context.append(ts)
        
        pos = nltk.pos_tag_sents(context)
        pos = list(itertools.chain(*pos))
        pos_collection.append(pos)
        bar.next()
    bar.finish()

    return pos_collection
Example #4
def readGenreBasedFilesAndTagWords(genre_to_file_list, meta_dict, tagger=None):
    for genre in genre_to_file_list:
        meta_dict_for_genre = meta_dict[genre]
        print '--------------------------------------------------------------'
        print 'Number of Files in genre ',genre,' : ',len(meta_dict_for_genre)
        for genre_file_path,genre_file_name in genre_to_file_list[genre]:
            if genre_file_name not in meta_dict_for_genre:
                continue
            pos_tag_dict = dict()
            with open(genre_file_path) as f:
                filelines = f.readlines()
                tokens = [ [word  for word in line.split()] for line in filelines]
                pos_tagged_lines = []
                if tagger != None:
                    pos_tagged_lines = tagger.tag_sents(tokens)
                else:
                    pos_tagged_lines = nltk.pos_tag_sents(tokens)
                for pos_tags in pos_tagged_lines:
                    for word,tag in pos_tags:
                        if tag not in pos_tag_dict:
                            pos_tag_dict[tag] = 0.0
                        pos_tag_dict[tag]+= 1.0
            total_tags = sum(pos_tag_dict.values())
            pos_tag_dict = {key:(pos_tag_dict[key]/total_tags) for key in pos_tag_dict}
            meta_dict_for_genre_file = meta_dict_for_genre[genre_file_name]
            meta_dict_for_genre_file[TAGS] = pos_tag_dict
        print 'Genre ', genre, ' Done'
        print '--------------------------------------------------------------'
Example #5
    def process(self):
        """
        process()

        Splits assigned text into list of words, normalizes them and
        creates a frequency distribution

        If no text is assigned - throws SourceNotAssigned exception

        Stores inside self.freq_dist a list of dicts:
        [{word, word_pos, word_freq}]
        word_pos is of database format (a,n,adv,v)

        Returns nothing
        """
        if not self._text:
            raise SourceNotAssigned
        pos_dict = {"J": [], "N": [], "R": [], "V": []}
        t = [nltk.word_tokenize(sent) for sent in self._text]
        words = [tt for sent in nltk.pos_tag_sents(t) for tt in sent if tt[1] in ACCEPTABLE_POS]
        for word in words:
            pos_dict[word[1][0]].append(normalize(word))
        del t, words

        self.freq_dist = []
        for key in pos_dict:
            for word, freq in nltk.FreqDist(pos_dict[key]).most_common():
                self.freq_dist.append(dict(word=word, word_pos=tag_to_pos[key], word_freq=freq))
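process() relies on module-level names that are not shown in this example (ACCEPTABLE_POS, tag_to_pos, normalize). A hypothetical sketch of what they could look like, matching the docstring's (a, n, adv, v) convention:

import nltk

# Hypothetical definitions, for illustration only.
ACCEPTABLE_POS = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS',
                  'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
tag_to_pos = {'J': 'a', 'N': 'n', 'R': 'adv', 'V': 'v'}

_lemmatizer = nltk.stem.WordNetLemmatizer()

def normalize(tagged_word):
    # tagged_word is a (word, tag) pair; lemmatize the word with a WordNet POS hint
    word, tag = tagged_word
    wordnet_pos = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}[tag[0]]
    return _lemmatizer.lemmatize(word.lower(), pos=wordnet_pos)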
Example #6
 def extract_candidate_words(self, text):
     # tokenize and POS-tag words
     tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                     for sent in nltk.sent_tokenize(text)))
     # filter on certain POS tags and lowercase all words
     candidates = [word.lower() for word, tag in tagged_words
                   if tag in self.good_tags and
                   word.lower() not in self.stop_words and
                   not all(char in self.punctuations for char in word)]
     return candidates
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word, pos, chunk): chunk != 'O') if key]
Example #8
 def extractTerms(self, doc):
     #doc = lambda doc: doc.decode('utf8', 'ignore')
     doc = doc.decode('utf-8')
     sents = nltk.sent_tokenize(doc)
     words = (nltk.word_tokenize(sent) for sent in sents)
     tagged_sents = nltk.pos_tag_sents(words)
     chunker = nltk.chunk.regexp.RegexpParser(self.grammer)
     chunked_sents = (chunker.parse(tagged_sent) for tagged_sent in tagged_sents)
     conll_tags = (nltk.chunk.tree2conlltags(chunked_sent) for chunked_sent in chunked_sents)
     all_chunks = list(itertools.chain.from_iterable(conll_tags))
     candidates = [' '.join(word for word, pos, chunk in group).lower()
             for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]
def Extract_Candidate_Chunks(Text , Grammar):

    ## Defining a Chunker based on the Grammar we defined above .
    Chunker = nltk.chunk.regexp.RegexpParser(Grammar)
    ## Assigning POS Tags .
    Tagged_Sentences = nltk.pos_tag_sents( nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(Text) )

    ## Gathering the Chunks made by the Chunker from the Tree created .
    Chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(Chunker.parse(Tagged_Sentences)) for Tagged_Sentences in Tagged_Sentences))

    ## We will now join the chunk words into a single Chunked Phrase .
    Candidates = [' '.join(Word for Word, Pos, Chunk in Group).lower() for Key, Group in itertools.groupby(Chunks, lambda (Word,Pos,Chunk): Chunk != 'O') if Key]
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]
Example #11
    def __extract_candidate_words(self, text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
        # exclude candidates that are stop words or entirely punctuation
        punct = set(string.punctuation)
        # stop_words = set(nltk.corpus.stopwords.words('english'))
        # tokenize and POS-tag words
        tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                        for sent in nltk.sent_tokenize(text)))
        # filter on certain POS tags and lowercase all words
        candidates = [self.__trim(word) for word, tag in tagged_words
                      if tag in good_tags and word.lower() not in self.__stop_words
                      and not all(char in punct for char in word)]

        return candidates
Example #12
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS','VB','VBD','VBG','VBN','VBP','VBZ'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    #print(candidates)
    return candidates
Example #13
	def char_recognition(self, char_number = 20):
		tagged_sentences = nltk.pos_tag_sents(self.tokenized_sentences)
		self.entities = []
		entity_names = []
		if nltk.__version__[0] == '3':
			chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
			for tree in chunked_sentences:
				entity_names.extend(extract_entity_names3(tree))
		else:
			chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=False)
			for tree in chunked_sentences:
				entity_names.extend(extract_entity_names(tree))
		count = Counter([name for name in entity_names])
		for c in count.most_common(char_number):
			self.entities.append(c[0])
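The helpers extract_entity_names / extract_entity_names3 are not shown in this example; a minimal sketch (an assumption, not the original code) of such a helper that walks the chunked tree and collects named-entity strings:

# Hypothetical helper: recursively collect entity strings from an NE-chunked sentence tree.
def extract_entity_names3(tree):
    entity_names = []
    if hasattr(tree, 'label') and tree.label() != 'S':
        # an NE subtree such as PERSON, GPE, ORGANIZATION: join its word leaves
        entity_names.append(' '.join(leaf[0] for leaf in tree.leaves()))
    else:
        for child in tree:
            if hasattr(child, 'label'):
                entity_names.extend(extract_entity_names3(child))
    return entity_names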
Example #14
def batch_tag_sentences(message_dict):
    """
    Uses a more efficient way of tagging all sentences for a given
    message at once.

    """
    num_sentences = [len(page['sentences']) for page in message_dict['urls']]
    all_sentences = [word_tokenize(s['s_clean']) for page in message_dict['urls'] for s in page['sentences']]
    all_tags = pos_tag_sents(all_sentences)

    for page_index, slice_length in enumerate(num_sentences):
        slice_start = sum(num_sentences[:page_index])
        slice_end = slice_start + slice_length
        for sentence_index, tags in enumerate(all_tags[slice_start:slice_end]):
            pos_tags = ['/'.join(b) for b in tags]
            message_dict['urls'][page_index]['sentences'][sentence_index]['pos_tags'] = ' '.join(pos_tags)
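The expected layout of message_dict is only implied by the keys used above; a small hypothetical input illustrating the structure the slicing logic assumes:

# Hypothetical input shape (assumption based on the keys accessed above).
message_dict = {
    'urls': [
        {'sentences': [{'s_clean': 'The cat sat on the mat.'},
                       {'s_clean': 'It purred.'}]},
        {'sentences': [{'s_clean': 'Dogs bark loudly.'}]},
    ]
}
batch_tag_sentences(message_dict)
# message_dict['urls'][0]['sentences'][0]['pos_tags'] is now something like 'The/DT cat/NN sat/VBD ...'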
def start(raw_text, user_sentence, length=10, is_debug=False):
    """
    Entry point.
    :param raw_text: the source text as a string
    :param user_sentence: a starting sentence, string, provided by the user
    :param length: how many words should we generate
    :return:
    """
    user_sentence = user_sentence.lower()

    sentences = process_raw_text(raw_text)
    lexica = [token for sent in sentences for token in sent]

    # When we create the bigrams for our matrices we process the whole text as a single list, without splitting it
    # into sentences. It causes a bug - if the last word of the list is unique no bigrams will ever start with this
    # word. There might be a couple of ways to circumvent this error, but in order to avoid any unforeseen issues in
    # future I simply add the first word of the text into the very end of the text, making sure that all the words are
    # located both in the left sides and in the right sides of our bigrams. This might slightly spoil our statistics,
    # however I am slightly randomizing statistics myself in order to get more unique results,
    # so this should not be an issue.
    if lexica.index(sentences[-1][-1]) == len(lexica) - 1:
        lexica.append(lexica[0])
        sentences[-1].append(lexica[0])

    # tagged_word_pairs = nltk.pos_tag(lexica)  # the shortcut - it's a bit worse, but only a bit
    tagged_word_pairs = [token for sent in nltk.pos_tag_sents(sentences) for token in sent]   # a list of ['word', 'POS']

    # processing user input
    tokenized_user_input = nltk.word_tokenize(user_sentence)
    user_input_pairs = nltk.pos_tag(tokenized_user_input)
    if (tokenized_user_input[-1]) not in lexica:
        return "Error! Please try a different word - the last word of your sentence is not present in the original text."
    try:
        number_of_words = int(length)
    except ValueError:
        return "Error! Please make sure to input a number!"

    probability_matrices = (generate_word_word_matrix(lexica, is_debug),
                            generate_word_pos_matrix(tagged_word_pairs, is_debug),
                            generate_pos_pos_matrix(tagged_word_pairs, is_debug)
                            )

    output = generate(user_input_pairs, probability_matrices, number_of_words)

    #output = generate([("i", "PRP")], probability_matrices, 42)

    return " ".join([pair[0] for pair in output])
Example #16
def extract_grammar_phrases(corpus, phrase_grammar_pattern):
    # build phrase list based on grammar pattern
    all_phrases = []
    grammar_pattern = nltk.chunk.regexp.RegexpParser(phrase_grammar_pattern)

    sentences = normalize_document(corpus,
                                   esc_html=False,
                                   expand_cont=False,
                                   lemmatize=False,
                                   tokenize=False,
                                   remove_special_char=False,
                                   remove_stop_words=False)
    for sentence in sentences:
        # POS tag sentences
        tagged_sentences = nltk.pos_tag_sents([nltk.word_tokenize(sentence)])

        # extract phrases based on pattern
        phrases = [
            grammar_pattern.parse(tagged_sentence)
            for tagged_sentence in tagged_sentences
        ]

        # extract word, pos tag, tag triples
        wtc_sentences = [
            nltk.chunk.tree2conlltags(phrase) for phrase in phrases
        ]
        flattened_phrases = list(
            itertools.chain.from_iterable(wtc_sentence
                                          for wtc_sentence in wtc_sentences))

        # get valid phrase based on tags
        valid_phrases_tagged = [
            (status, [wtc for wtc in chunk])
            for status, chunk in itertools.groupby(
                flattened_phrases, lambda (word, pos, chunk): chunk != 'O')
        ]

        valid_phrases = [
            ' '.join(word.lower() for word, tag, chunk in wtc_group
                     if word.lower() not in stopword_list)
            for status, wtc_group in valid_phrases_tagged if status
        ]

        all_phrases.append(valid_phrases)

    return all_phrases
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word, pos, chunk): chunk != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def extract_candidate_words(
    text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
    import itertools, nltk, string
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(
            nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [
        word.lower() for word, tag in tagged_words
        if tag in good_tags and word.lower() not in stop_words and not all(
            char in punct for char in word)
    ]
    return candidates
Example #19
 def char_recognition(self, char_number=20):
     tagged_sentences = nltk.pos_tag_sents(self.tokenized_sentences)
     self.entities = []
     entity_names = []
     if nltk.__version__[0] == '3':
         chunked_sentences = nltk.ne_chunk_sents(tagged_sentences,
                                                 binary=False)
         for tree in chunked_sentences:
             entity_names.extend(extract_entity_names3(tree))
     else:
         chunked_sentences = nltk.batch_ne_chunk(tagged_sentences,
                                                 binary=False)
         for tree in chunked_sentences:
             entity_names.extend(extract_entity_names(tree))
     count = Counter([name for name in entity_names])
     for c in count.most_common(char_number):
         self.entities.append(c[0])
def extract_candidate_chunks(text):

    import nltk, itertools, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    grammar = "KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"
    chunker = nltk.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                      for sent in nltk.sent_tokenize(text))

    # ==== method 1======
    candidates_with_POS = []
    candidates = []
    tree_chunked_sents = [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
    for tree in tree_chunked_sents:
        for subtree in tree.subtrees():
            if subtree.label() == 'KT':
                candidates_with_POS.append(subtree.leaves())
    for cand in candidates_with_POS:
        NP = []
        for word, pos in cand:
            NP.append(word.lower())
        candidates.append(" ".join(NP))

    # ==== method 2======
    # BOI_tagged_chunked_sents = [nltk.tree2conlltags(tree_chunked_sent)
    #                             for tree_chunked_sent in tree_chunked_sents]
    # all_chunks = list(itertools.chain.from_iterable(BOI_tagged_chunked_sents))
    #
    # #   get all the NP Chunk and exclude all the non-NP chunk
    # groups = []
    # for key, group in itertools.groupby(all_chunks, lambda x : x[2]!='O'):
    #     if(key):
    #         groups.append(list(group))
    #
    # # get all the candidate except stopwords and punkt
    # candidates = [" ".join(word for word, pos, chunk in group).lower()
    #               for group in groups]
    candidates = [candidate for candidate in candidates
                  if candidate not in stop_words
                  and not all(char in punct for char in candidate)]
    return candidates
Example #21
def generate_keyword(texts, method='phrase', remove_punctuation=False):
    """
    Generate word candidate from given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable(
            [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use word or phrase")
    return candidates
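A short usage sketch (an assumption; it presumes the surrounding module defines the stop_words and punct_re globals used above and that the NLTK tokenizer/tagger data is installed):

doc = "Compatibility of systems of linear constraints over the set of natural numbers."
print(generate_keyword(doc, method='phrase'))
# e.g. ['compatibility of systems', 'linear constraints', 'set of natural numbers']
print(generate_keyword(doc, method='word'))
# e.g. ['compatibility', 'systems', 'linear', 'constraints', 'set', 'natural', 'numbers']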
Example #22
def extract_chunks(text_string, max_words=3, lemmatize=True):
    """
    Extract phrase nouns by using regex
    """
    # Any number of adjectives followed by any number of nouns and (optionally) again
    # any number of adjectives folowerd by any number of nouns
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

    # Makes chunks using grammar regex
    chunker = nltk.RegexpParser(grammar)

    # Get grammatical functions of words
    # What this is doing: tag(sentence -> words)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))

    # Make chunks from the sentences, using grammar. Output in IOB.
    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))
    # Join phrases based on IOB syntax.
    candidates = [
        ' '.join(w[0] for w in group).lower()
        for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O')
        if key
    ]

    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))

    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(
        filter(
            lambda l: l not in stop_words and not all(c in punct for c in l),
            candidates))

    # lemmatize
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatizer(x) for x in candidates]
    print(candidates)
    return candidates
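A usage sketch (an assumption, not from the source); note the function also prints the candidates before returning them:

text = "Deep learning has transformed natural language processing."
chunks = extract_chunks(text, max_words=3)
# chunks is e.g. ['deep learning', 'natural language processing']; only single-word
# candidates are altered by the WordNet lemmatizer, multi-word phrases pass through unchanged.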
def get_topic_tokens(data):
    stopwords = set(nltk.corpus.stopwords.words())
    tokens = nltk.word_tokenize(data)
    tokens = [t.lower() for t in tokens]
    stop_list = []
    for i in range(len(tokens)):
        if tokens[i] == '.':
            stop_list.append(i)
    sents_list = []
    for s in izip([0] + stop_list[:-1], stop_list):
        sents_list.append(tokens[s[0]:s[1]])
    tag_token_sents = nltk.pos_tag_sents(sents_list)
    nn_list = []
    for s in tag_token_sents:
        nns = [t[0] for t in s if t[0].isalpha() and t[1] == 'NN']
        nn_list.extend([t for t in nns if t not in stopwords and len(t) > 3])
    print nn_list[:30]
    return nn_list
Example #24
def make_wordcloud(word_count):
    twitter = TweetTokenizer(strip_handles=True, reduce_len=True)
    # twitter = word_tokenize()

    sentences_tag = []
    # Remove special characters with a regex, then tokenize and add to the list
    for sentence in title_list:
        wd = re.sub("[-=+,·#/\?:^$.@*\"※~&%ㆍ!’』\\‘|\(\)\[\]\<\>`\'…》]", '',
                    sentence)
        morph = twitter.tokenize(wd)
        sentences_tag.append(morph)
        print(morph)
        print('-' * 30)

    print(sentences_tag)
    print('\n' * 3)

    noun_adj_list = []

    # Keep only nouns and adjectives and add them to the list
    tagged = nltk.pos_tag_sents(sentences_tag)
    for sentence1 in tagged:
        for word, tag in sentence1:
            if tag in ['NN', 'NNP', 'JJ']:
                print(word, tag)
                noun_adj_list.append(word)

    # Count occurrences per token
    counts = Counter(noun_adj_list)
    tags = counts.most_common(word_count)
    print(tags)

    # Generate the word cloud
    # Specify font_path so Korean characters are not broken
    wc = WordCloud(font_path='/Library/Fonts/NanumGothic.ttf',
                   background_color='white',
                   width=800,
                   height=600)
    print(dict(tags))
    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
Example #25
def extract_candidate_chunks(text,
                             grammar=r'NP: {<JJ>*<NN>}',
                             delimiter='_',
                             stemmer=None):

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))

    # join constituent chunk words into a single chunked phrase

    if stemmer:
        stem = stemmer.stem
    else:
        stem = lambda x: x

    candidates = []
    for key, group in itertools.groupby(
            all_chunks,
            lambda word_pos_chunk_triple: word_pos_chunk_triple[2] != 'O'):
        if key:
            words = []
            for word, pos, chunk in group:
                try:
                    word = stem(word)
                except IndexError:
                    print("word unstemmable:", word)
                words.append(word)
            candidates.append(delimiter.join(words).lower())

    return [
        cand for cand in candidates
        if cand not in stop_words and not all(char in punct for char in cand)
    ]
Example #26
def get_instances(xml_file, key_file):
    def should_be_omitted(num_instances):
        return num_instances < 5

    tree = ET.parse(xml_file)
    xml_instances = tree.getroot().findall('.//instance')

    ids = map(lambda x: x.attrib['id'], xml_instances)
    if should_be_omitted(len(ids)):
        return []

    heads = map(lambda x: x.find('context').text or '', xml_instances)
    # the part of the context behind the <head> doesn't get included in head
    # so we use the tail of the head to obtain it
    tails = map(lambda x: x.find('.//head').tail or '', xml_instances)

    full_context = [(head + tail).split() for head, tail in izip(heads, tails)]
    pos_tags_of_full_context = nltk.pos_tag_sents(full_context)

    with open(key_file) as labels_file:
        labels = [line.split(' ')[2] for line in labels_file]
        # labels is now the sense in wordnet, in the senseval format
        # but we should convert it into a numbered format, based on a key file (use SENSE_TO_INDEX)

    csv_instances = []
    for number, head, tail, label, pos_tags in izip(ids, heads, tails, labels,
                                                    pos_tags_of_full_context):
        head_pos_tags = pos_tags[:len(head.split())]
        head_pos_tags = map(lambda x: x[1], head_pos_tags)

        assert len(head_pos_tags) == len(head.split())

        tail_pos_tags = pos_tags[len(head.split()):]
        tail_pos_tags = map(lambda x: x[1], tail_pos_tags)

        assert len(tail_pos_tags) == len(tail.split())

        instance = Instance(number, head, tail, head_pos_tags, tail_pos_tags,
                            label)

        csv_instances.append(instance)

    return csv_instances
Example #27
def pos_counts(text, pos_list):
    """Return the sorted list of distinct words with a given part of speech
    >>> emma = nltk.corpus.gutenberg.raw('austen-emma.txt')
    >>> pos_counts(emma, ['DET', 'NOUN'])
    [14352, 32029]
    """
    temp = []
    answer = []  # Arrays for count and final answer
    pos = [nltk.word_tokenize(s)
           for s in nltk.sent_tokenize(text)]  # tokenize the text
    tagged = nltk.pos_tag_sents(pos, tagset="universal")  # Tag each word
    for tag in tagged:
        for word in tag:
            temp.append(word[1])  # Select only the tagged part
    posCounter = Counter(temp)
    for p in pos_list:
        answer.append(
            posCounter[p])  #count the amount of PoS that is specified
    return answer
Example #28
    def extract_candidate_chunks(self, text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
        import itertools, nltk, string

        nltk.data.path.append('/home/guanhua/sunhongyu/iGitRepo/project/other/nltk_data')
        # exclude candidates that are stop words or entirely punctuation
        punct = set(string.punctuation)
        stop_words = set(nltk.corpus.stopwords.words('english'))

        # tokenize, POS-tag, and chunk using regular expressions
        chunker = nltk.chunk.regexp.RegexpParser(grammar)
        sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
        tagged_sents = nltk.pos_tag_sents(sents, lang='eng')
        all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                        for tagged_sent in tagged_sents))

        # join constituent chunk words into a single chunked phrase
        candidates = [' '.join(word for word, pos, chunk in group).lower() for key, group in itertools.groupby(all_chunks, lambda word__pos__chunk: word__pos__chunk[2] != 'O') if key]

        return [cand for cand in candidates if cand not in stop_words and not all(char in punct for char in cand)]
Example #29
def extract_candidate_chunks(text, grammar=r'KT: {<NNP>+?}'):

    # exclude candidates that are stop words or entirely punctuation
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [
        ' '.join(word for word, pos, chunk in group)
        for key, group in itertools.groupby(all_chunks, lambda x: x[2] != 'O')
        if key
    ]

    return candidates
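With the <NNP>+ grammar this variant effectively pulls out proper-noun phrases; a usage sketch (an assumption; nltk and itertools are assumed to be imported at module level as above):

text = "Barack Obama met Angela Merkel in Berlin last Tuesday."
print(extract_candidate_chunks(text))
# e.g. ['Barack Obama', 'Angela Merkel', 'Berlin', 'Tuesday']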
Example #30
def noun_phrases(text, *args, **kwargs):
    sentences = (nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    tagged_sentences = nltk.pos_tag_sents(sentences)

    # chunking based on https://www.nltk.org/book/ch07.html
    grammar = "NP: {<DT>?<JJ.*>*<NN.*>*<JJ.*>*}"

    cp = nltk.RegexpParser(grammar)

    NPs = []  # Noun phrases as Tree objects
    flat_noun_phrases = []  # Strings containing the noun phrases
    for sentence_tree in cp.parse_sents(tagged_sentences):
        for subtree in sentence_tree.subtrees():
            if subtree.label() == "NP":
                NPs.append(subtree)
                np = ' '.join(w[0] for w in subtree.leaves())
                flat_noun_phrases.append(np)
                yield np
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate word candidate from given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence) # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_) # POS tagging

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable([tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
Example #32
def extract_candidate_chunks(text, typ='title'):
    """
    extract candidate chunks from given text
    :param text: string: a single text
    :return: candidates: list, contain a series of candidate chunks
    """
    import nltk, itertools, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    if typ == 'title':
        grammar = "KT: {<JJ>* <NN.*>+}"
    else:
        grammar = "KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"
    chunker = nltk.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    # ==== method 1======
    candidates_with_POS = []
    candidates = []
    tree_chunked_sents = [
        chunker.parse(tagged_sent) for tagged_sent in tagged_sents
    ]
    for tree in tree_chunked_sents:
        for subtree in tree.subtrees():
            if subtree.label() == 'KT':
                candidates_with_POS.append(subtree.leaves())
    for cand in candidates_with_POS:
        NP = []
        for word, pos in cand:
            NP.append(word.lower())
        candidates.append(" ".join(NP))

    candidates = [
        candidate for candidate in candidates
        if candidate not in stop_words and not all(char in punct
                                                   for char in candidate)
    ]
    return candidates
Example #33
def extract_candidate_chunks(text, chunker, stop_words, punctuation):
    # tokenize, POS-tag, and chunk using regular expressions
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = [
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        for tagged_sent in tagged_sents
    ]
    all_chunks = list(itertools.chain.from_iterable(all_chunks))
    # join constituent chunk words into a single chunked phrase
    candidates = [
        '-'.join(word for word, pos, chunk in group).lower()
        for key, group in itertools.groupby(all_chunks, group_func) if key
    ]

    return set([
        cand for cand in candidates if cand not in stop_words and not all(
            char in punctuation for char in cand) and 2 < len(cand) < 15
    ])
def get_most_useful_pos_bigrams(count):
    """
    Computes the most frequently occurring POS tag bigrams.
    """
    filename = f'statistics/most_useful_pos_bigrams_{count}.json'

    if os.path.exists(filename):
        with open(filename) as _file:
            return [tuple(bigram) for bigram in json.load(_file)]

    bigram_counts_file = 'statistics/pos_bigram_counts.json'

    if os.path.exists(bigram_counts_file):
        with open(bigram_counts_file) as _file:
            counter = Counter(
                {tuple(key): value
                 for key, value in json.load(_file)})
    else:
        counter = Counter()

        for subreddit in tqdm(SUBREDDITS):
            with open(f'subreddits/{subreddit}.json') as _file:
                posts = json.load(_file)

            for post in tqdm(posts, desc=f'/r/{subreddit}'.ljust(20, ' ')):
                tag_sents = pos_tag_sents(word_tokenize(s) for s in sent_tokenize(post['content']))

                for sent in tag_sents:
                    for (token_a, pos_a), (token_b,
                                           pos_b) in zip(sent, sent[1:]):
                        counter[pos_a, pos_b] += 1

        with open(bigram_counts_file, 'w') as _file:
            json.dump(list(counter.items()), _file)

    results = [bigram for bigram, amount in counter.most_common(count)]
    results.sort()

    with open(filename, 'w') as _file:
        json.dump(results, _file)

    return results
Example #35
def predict():

    result = request.form

    with open('tokens.pkl', 'rb') as f:
        tokens = load(f)

    with open('tags.pkl', 'rb') as f:
        tags = load(f)

    with open('sent_tokens.pkl', 'rb') as f:
        sent_tokens = load(f)

    text = str(result['texto'])
    text = re.sub(r'\.+', ".", text).split('.')
    text = [re.sub(r'[^\w\s]', '', x).strip() for x in text]
    text = [x.strip() for x in text if x.strip()]

    new_data = DataFrame(text, columns=['Sentença'])
    new_data['tag'] = pos_tag_sents(new_data['Sentença'].apply(word_tokenize).tolist(), lang='pt')
    counts = []
    for k in range(len(new_data)):
        counts.append(Counter([j for i, j in new_data['tag'][k]]))
    dmm = DataFrame(counts).fillna(0)
    for i in range(len(tags)):
        if tags[i] not in dmm:
            dmm[tags[i]] = 0

    vec = CountVectorizer(vocabulary=tokens)
    dtm = DataFrame(vec.fit_transform(text).toarray(), columns=vec.get_feature_names())

    sent = dtm.loc[:, sent_tokens].apply(sum, axis=1)

    with open('classifier.pkl', 'rb') as f:
        classifier = load(f)

    prediction = classifier.predict(concat([dtm, dmm, sent], axis=1))
    proportion = 100*sum(prediction == 'F')/len(prediction)

    new_data['classe'] = prediction

    return render_template('result.html', prediction=proportion, table=new_data.to_html())
Example #36
    def extract_candidate_chunks(self):
        grammar = r'KT: { (<NN.*>+ <JJ.*>?)|(<JJ.*>? <NN.*>+)}'
        punct = set(string.punctuation)
        stop_words = set(nltk.corpus.stopwords.words('english'))
        chunker = nltk.chunk.regexp.RegexpParser(grammar)
        tagged_sents = nltk.pos_tag_sents(
            nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(self.text))
        all_chunks = list(
            itertools.chain.from_iterable(
                nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                for tagged_sent in tagged_sents))
        candidates = [
            ' '.join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(
                all_chunks,
                lambda word__pos__chunk: word__pos__chunk[2] != 'O') if key
        ]
        x = [
            cand for cand in candidates
            if cand not in stop_words and not all(char in punct
                                                  for char in cand)
        ]

        data = []

        for i in range(0, len(x), 1):
            if len(x[i].split()) == 1:
                if re.match("^[A-Za-z0-9]*$", x[i]):
                    if len(x[i]) > 2:
                        data.append(x[i])

            else:
                add = ""
                split = x[i].split()
                lenth = len(split)
                for i in range(0, lenth, 1):
                    king = re.match("^[A-Za-z0-9]*$", split[i])
                    if len(str(king)) > 2:
                        add = add + " " + split[i]
                data.append(add.strip())

        return data
Example #37
def ner_recognize_string(string):
    sentences = nltk.sent_tokenize(string)
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentences.append(nltk.word_tokenize(sentence))
    tagged_sentences = nltk.pos_tag_sents(tokenized_sentences)
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)

    entities = set()

    for tree in chunked_sentences:
        for x in tree:
            if type(x) is tuple and "NNP" in x[1]:
                # print x[0]
                entities.add(x[0])
            elif type(x) is nltk.tree.Tree:
                # print string_from_tree(x)
                entities.add(string_from_tree(x))

    return entities
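string_from_tree is an assumed helper (presumably joining the leaves of an entity subtree); a usage sketch, not from the source:

print(ner_recognize_string("Barack Obama visited Paris with Angela Merkel."))
# e.g. {'Barack Obama', 'Paris', 'Angela Merkel'}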
Example #38
def getWordPattern(text):
    import itertools, nltk, string
    stop_words = set(nltk.corpus.stopwords.words('english'))

    punct = set(string.punctuation)
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(
            nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))

    # candidates = [[ tag, word.lower()] for word, tag in tagged_words
    #               if not all(char in punct for char in word)]
    candidates = [
        [tag, word.lower()] for word, tag in tagged_words
        if not all(char in punct
                   for char in word) and word.lower() not in stop_words
    ]

    res = pd.DataFrame.from_records(candidates, columns=['tag', 'word'])

    return res
Example #39
def extract_candidate_words(text, minlen = 3, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    import itertools, nltk, string, re

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    clean_candidates=[]
    for candidate in candidates:
        if re.search(r'[a-zA-Z0-9_]*\-*[a-zA-Z0-9_]*', candidate).group() == '': continue #remove wonky candidates
        if len(candidate)>=minlen:
            clean_candidates.append(candidate)
            
    return clean_candidates
Example #40
def batch_tag_sentences(message_dict):
    """
    Uses a more efficient way of tagging all sentences for a given
    message at once.

    """
    num_sentences = [len(page['sentences']) for page in message_dict['urls']]
    all_sentences = [
        word_tokenize(s['s_clean']) for page in message_dict['urls']
        for s in page['sentences']
    ]
    all_tags = pos_tag_sents(all_sentences)

    for page_index, slice_length in enumerate(num_sentences):
        slice_start = sum(num_sentences[:page_index])
        slice_end = slice_start + slice_length
        for sentence_index, tags in enumerate(all_tags[slice_start:slice_end]):
            pos_tags = ['/'.join(b) for b in tags]
            message_dict['urls'][page_index]['sentences'][sentence_index][
                'pos_tags'] = ' '.join(pos_tags)
Example #41
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    
    # ------ exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    # print punct
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # print stop_words

    # print chunker
    sents = nltk.sent_tokenize(text)
    # print sents
    words = (nltk.word_tokenize(sent) for sent in sents)
    # print words
    tagged_sents = nltk.pos_tag_sents(words)
    # print tagged_sents

    # ------ tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    chunked_sents = (chunker.parse(tagged_sent) for tagged_sent in tagged_sents)
    # for chunked_sent in chunked_sents: print chunked_sent
    
    conll_tags = (nltk.chunk.tree2conlltags(chunked_sent) for chunked_sent in chunked_sents)
    # for conll_tag in conll_tags: print conll_tag 
    all_chunks = list(itertools.chain.from_iterable(conll_tags))
    print all_chunks

    # ------ join constituent chunk words into a single chunked phrase
    #for chunk in all_chunks:
    #    lambda(word, pos, chunk): chunk != 'O'
    #    print word

    #for key, group in itertools.groupby(all_chunks, lambda(word,pos,chunk): chunk != 'O'):
    #    #print key
    #    for word, pos, chunk in group:
    #        #print key, word, pos, chunk
    #        if key: 
    #            print ' '.join(word for word,pos,chunk in group).lower() 

    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]
Example #42
    def vectorize(self, path_to_json):
        file = open(path_to_json, "r")
        text = json.load(file)["text"]
        tokenized_sentences = nltk.sent_tokenize(text)
        tokenized_words = [
            nltk.word_tokenize(sent) for sent in tokenized_sentences
        ]
        pos_tagged_text = nltk.pos_tag_sents(tokenized_words)

        words = self.word_getter.get(text)

        tags = {}

        for sentence in pos_tagged_text:
            for tagged_word in sentence:
                if tagged_word[0] not in tags.keys():
                    tags[tagged_word[0]] = []
                tags[tagged_word[0]].append(tagged_word[1])

        result = {}
        for word in words:
            result[word] = [
                0, 0, 0, 0, 0
            ]  # Noun, Verb, Foreign Word, Adverb, Cardinal Digit

        for word, tag_list in tags.items():
            if word not in result.keys():
                continue
            for tag in tag_list:

                if tag.startswith("N"):
                    result[word][0] = 1
                elif tag.startswith("V"):
                    result[word][1] = 1
                elif tag == "FW":
                    result[word][2] = 1
                elif tag.startswith("RB"):
                    result[word][3] = 1
                elif tag == "CD":
                    result[word][4] = 1
        return result
Example #43
def select_candidates(text):
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

    stop_words = set(nltk.corpus.stopwords.words('english'))

    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))

    candidates = [
        ' '.join(word for word, pos, chunk in group).lower()
        for key, group in itertools.groupby(
            all_chunks, lambda wordposchunk: wordposchunk[2] != 'O') if key
    ]

    result = [
        cand for cand in candidates
        if cand not in stop_words and len(cand.split()) < 4
    ]

    #result = list(dict.fromkeys(result))

    #if len(result) < 10:
    #grammar = r'KT: {<DT>? <JJ>* (<NN>|<NP>|<PRN>)+}'

    #chunker = nltk.chunk.regexp.RegexpParser(grammar)

    #all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
    #for tagged_sent in tagged_sents))
    #candidates = [' '.join(word for word, pos, chunk in group).lower()
    #for key, group in itertools.groupby(all_chunks, lambda wordposchunk: wordposchunk[2] != 'O') if key]

    #result = [cand for cand in candidates
    #if cand not in stop_words and len(cand.split()) < 4 and not all(char in punct for char in cand)]

    return list(dict.fromkeys(result))
Example #44
def create_phrase_vocabulary(raw_data):
    '''
    Extract a vocabulary of noun phrases. TfidfVectorizer only extracts n-grams automatically,
    so if we want a different phrase format or a custom vocabulary, it must be built explicitly.
    '''

    #grammar to extract the noun phrase
    grammar = r'NP: {(<JJ.*>* <VBN>? <NN.*>+ <IN>)? <JJ.*>* <VBG>? <NN.*>+}'

    #set the punctuation and chunker
    punct = set(string.punctuation)
    chunker = RegexpParser(grammar)

    def lambda_unpack(f):
        #function to unpack the tuple
        return lambda args: f(*args)

    #tokenize and create pos tags per sentence, then get its IOB tag
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data)
    noun_phrases = list(
        chain.from_iterable(
            tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in postag_sents))

    #join B-NP and I-NP tags as one noun phrase excluding O tags
    merged_nounphrase = [
        ' '.join(stemmer.stem(word) for word, pos, chunk in group).lower()
        for key, group in itertools.groupby(
            noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O'))
        if key
    ]

    #filter the term below than two characters and punctuation
    all_nounphrases = [
        cand for cand in merged_nounphrase
        if len(cand) > 2 and not all(char in punct for char in cand)
    ]

    #select distinct noun phrases
    vocabulary = (list(set(all_nounphrases)))
    return vocabulary
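The docstring mentions TfidfVectorizer; a hedged follow-up sketch (an assumption, not part of the source) of how the vocabulary could be plugged in. Because the vocabulary terms are stemmed and lowercased, a custom analyzer that applies the same stemmer would be needed in practice for the n-grams to match:

from sklearn.feature_extraction.text import TfidfVectorizer

vocabulary = create_phrase_vocabulary(raw_data)
# ngram_range must cover the longest noun phrase kept in the vocabulary
vectorizer = TfidfVectorizer(vocabulary=vocabulary, ngram_range=(1, 5))
tfidf_matrix = vectorizer.fit_transform(raw_data)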
Example #45
def preprocess(text):
    """ Tag a english text with pos

    Args:
        text (str): The target text.

    Returns:
        pos_sents (list): A list of lists of tuples, containing each sentence with word-pos pairs.

    Examples:
    >>> text = "I am a loser. I don't have girlfriend."
    >>> preprocess(text)
    [
        [('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('loser', 'NN'), ('.', '.')],
        [('I', 'PRP'), ('do', 'VBP'), ("n't", 'RB'), ('have', 'VB'), ('girlfriend', 'NN'), ('.', '.')]
    ]
    """
    sentences = nltk.sent_tokenize(text)
    seg_sents = [nltk.word_tokenize(sent) for sent in sentences]
    pos_sents = nltk.pos_tag_sents(seg_sents)
    return pos_sents
Example #46
def pos_counts(text, pos_list):
    """Return the sorted list of distinct words with a given part of speech
    >>> emma = nltk.corpus.gutenberg.raw('austen-emma.txt')
    >>> pos_counts(emma, ['DET', 'NOUN'])
    [14352, 32029]
    """

    result = []
    sentTokenized_text = nltk.sent_tokenize(text)
    tokenized_text = [nltk.word_tokenize(s) for s in sentTokenized_text]

    token_list = nltk.pos_tag_sents(tokenized_text, tagset="universal")

    pos_list_fd = nltk.FreqDist(
        [tag for tags in token_list for (word, tag) in tags])

    for word in pos_list:
        value = pos_list_fd[word]
        result.append(value)

    return result
Example #47
def sentence_postag(reviewSentence):
    """
    형태소 분석 by sentence
    tokenize : nltk.word_tokenize + '.','/' 으로 분할
    """
    re_split = re.compile('[/.-]')
    tokenize = [nltk.word_tokenize(sent) for sent in reviewSentence]
    tokenize2 = []
    for sent in tokenize:
        sent_token = []
        for word in sent:
            if bool(re_split.search(word)):  # split if the word contains at least one '/', '.' or '-'
                token = re_split.split(word)
                sent_token.extend(token)
            else:
                sent_token.append(word)
        sent_token = [word for word in sent_token
                      if len(word) > 0]  # drop zero-length strings
        tokenize2.append(sent_token)
    tagged = nltk.pos_tag_sents(tokenize2)
    return tagged
Example #48
def analize_text(text: str, *, exact_words: bool = False) -> tuple:
    sentences = array(split_into_sentences(text, True))
    if (not len(sentences)):
        print("Nothing found")
        return []

    tags = pos_tag_sents(map(word_tokenize, sentences))

    if (not exact_words):
        lemmatized = lemmatize_sents(tags)
    else:
        lemmatized = tags

    chunker = RegexpParser("AC: {(<CD>?<TO|IN>?<CD>)+}\n "
                           "AN: {(<NPP>+<DT|NPP|JJ>*)+}\n "
                           "}<DT>+{\n "
                           "PH: {<[B-Z]+>+}\n "
                           "}<DT|CC|PRP|EX|WDT>+{")

    chunked = list(chunker.parse_sents(lemmatized))

    return (*setup_search_structure(chunked, tuple), sentences)
def get_chunks(sentences, grammar = r'NP: {<DT>? <JJ>* <NN.*>+}'):
    
    all_chunks = []
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    
    for sentence in sentences:
        
        tagged_sents = nltk.pos_tag_sents(
                            [nltk.word_tokenize(sentence)])
        
        chunks = [chunker.parse(tagged_sent) 
                  for tagged_sent in tagged_sents]
        
        wtc_sents = [nltk.chunk.tree2conlltags(chunk)
                     for chunk in chunks]    
         
        flattened_chunks = list(
                            itertools.chain.from_iterable(
                                wtc_sent for wtc_sent in wtc_sents)
                           )
        
        valid_chunks_tagged = [(status, [wtc for wtc in chunk]) 
                        for status, chunk 
                        in itertools.groupby(flattened_chunks, 
                                             lambda (word,pos,chunk): chunk != 'O')]
        
        valid_chunks = [' '.join(word.lower() 
                                for word, tag, chunk 
                                in wtc_group 
                                    if word.lower() 
                                        not in stopword_list) 
                                    for status, wtc_group 
                                    in valid_chunks_tagged
                                        if status]
                                            
        all_chunks.append(valid_chunks)
    
    return all_chunks
def pos_tag(docs):
    """
    Args:
        docs ([[[str]]])
    """
    return map(lambda doc: nltk.pos_tag_sents(doc), docs)
#get unique/total ratio
ratios = [(float(len(set(words)))/float(len(words))) for words in tokens]
plt.scatter(years, ratios)
plt.show()

#Collocations
lower = [[word.lower() for word in words] for words in tokens]
bigram_measures = nltk.collocations.BigramAssocMeasures()
for i in range(len(years)):
    finder = BigramCollocationFinder.from_words(lower[i])
    finder.apply_freq_filter(2)
    print (years[i], finder.nbest(bigram_measures.pmi, 10))

#chunk text and extract entities
postags = [nltk.pos_tag_sents(entry) for entry in senttokens]
ne_tags = [nltk.ne_chunk_sents(pos, binary=True) for pos in postags]
ents = [extract_entities(tagged) for tagged in ne_tags]
entFreqs = [nltk.FreqDist(entry) for entry in ents]

#get freq dist of all entities
allentities = [item for sublist in ents for item in sublist]
allentfreq = nltk.FreqDist(allentities)

#make list of top 50 most frequent and prune individual docs to take out filtered words
filtered, freq = zip(*allentfreq.most_common(50))
pruned = []
for entFreq in entFreqs:
    ents, freqs = zip(*entFreq.most_common(100))
    topEnts = [x for x in ents if x not in filtered]
    pruned.append(topEnts)
Example #52
 def tag_many(self, documents, tagset=None, **kwargs):
     """ POS-Tag many documents. """
     return pos_tag_sents((word_tokenize(d) for d in documents), tagset)
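A quick standalone sketch of the same call (an assumption: pos_tag_sents and word_tokenize are imported from nltk):

from nltk import pos_tag_sents, word_tokenize

docs = ["The dog barked loudly.", "It slept."]
print(pos_tag_sents((word_tokenize(d) for d in docs), tagset='universal'))
# roughly [[('The', 'DET'), ('dog', 'NOUN'), ('barked', 'VERB'), ('loudly', 'ADV'), ('.', '.')],
#          [('It', 'PRON'), ('slept', 'VERB'), ('.', '.')]]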
Example #53
import itertools
import nltk
import operator
import pandas as pd
import re

import functions as f

df = f.load_data()

## tokenize
sents = df['ingredient_txt'].map(lambda x: map(nltk.word_tokenize, x.split('\n')))
## remove first and last elements, which are empty lists.
sents = map(lambda x: x[1:-1], sents)

tagged = [nltk.pos_tag_sents(x) for x in sents[0:1000]]

## trying named entity recognition.
nltk.ne_chunk(tagged[0])


## trying hand-coded identification of ingredients.

def seq(pos, x): return [t[pos] for t in x]

## split tokens and tags into separate lists.
tok_seq = map(lambda x: map(lambda xx: seq(0, xx), x), tagged)
tag_seq = map(lambda x: map(lambda xx: seq(1, xx), x), tagged)

## create mapping between flattened list of ingredients and recipe ids.
idx = {}
Example #54
#Each document will also disregard anything said by the interviewer. The logic 
#is that the interviewer's purpose is nothing more than to tease information out
#of the interview subject and thus would be repetitive or not data rich.

intervieweeTranscriptDict=dict()

with open(data_path) as data_file:
    csv_reader=csv.reader(data_file, delimiter=',',quotechar='"')
    for row in csv_reader:
        if row[1]=='Interviewee':
            if row[0] in intervieweeTranscriptDict:
                intervieweeTranscriptDict[row[0]].extend([nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(row[2])])
            else:
                intervieweeTranscriptDict[row[0]]=[nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(row[2])]

intervieweeTranscriptDict={interviewee: nltk.pos_tag_sents(doc) for interviewee,doc in intervieweeTranscriptDict.items()}

transcriptList=list()

for doc in intervieweeTranscriptDict.values():
    bufferDocList=list()
    stopwords=nltk.corpus.stopwords.words('english')
    for sentence in doc:
        for word, pos in sentence:
            posTest=nltk.tag.map_tag('en-ptb','universal',pos)
            if ((str.lower(word) not in stopwords and word not in set(string.punctuation)) and (posTest=='NOUN' or posTest=='VERB' or posTest=='ADJ' or posTest=='ADV')):
                bufferDocList.append(str.lower(word))
    transcriptList.append(bufferDocList)


#*************END PREPROCESSING
Example #55
def get_corpus_pos(dic):
    return {interviewee: nltk.pos_tag_sents(doc) for interviewee,doc in dic.items()}
Example #56
 def getPOSTags(self, article):
   articleSents = list(filter(bool, [line.lower().replace("<s>", "").replace("</s>", "").strip().split() for line in article.split("\n")]))
   postags = nltk.pos_tag_sents(articleSents)
   return postags