Example #1
 def __init__(self, nfeatures=100000, doclen=60):
     self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
     # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
     # self.grammar = r'KT: {<RB.>|<JJ.>}'
     self.chunker = RegexpParser(self.grammar)
     self.nfeatures = nfeatures
     self.doclen = doclen
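Not part of the original snippet: a minimal usage sketch showing what the KT chunker built in this constructor matches on one POS-tagged sentence. It assumes nltk and its tokenizer/tagger data are installed.

import nltk
from nltk.chunk.regexp import RegexpParser

grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'  # same KT grammar as above
chunker = RegexpParser(grammar)

# Tag an example sentence and print the chunk tree; KT subtrees mark the matched key terms.
tagged = nltk.pos_tag(nltk.word_tokenize("The extremely fast runner easily won the long race"))
print(chunker.parse(tagged))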
Example #2
    def build_vocabulary(self):
        """
        Generate a list of candidate phrases from the documents, using POS tagging and chunking
        functionality of nltk.
        """
        stop_words = set(stopwords.words('english'))

        vocabulary = []
        for doc in self.documents:
            words = []
            candidates = []
            clean_doc = text_cleaner(doc)
            sentences = sent_tokenize(clean_doc)
            words.extend([word_tokenize(sentence) for sentence in sentences])
            tagged_words = pos_tag_sents(words)

            grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
            chunker = RegexpParser(grammar)
            # split into a private function
            all_tag = chain.from_iterable(
                [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
            for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
                candidate = ' '.join([word for (word, pos, chunk) in group])
                if key is True and candidate not in stop_words:
                    candidates.append(candidate)
            vocabulary.append(candidates)

        vocabulary = list(chain(*vocabulary))
        vocabulary = list(np.unique(vocabulary))

        self.vocabulary = vocabulary
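The tree2conlltags + groupby idiom used in build_vocabulary above recurs throughout these examples. Below is a small, self-contained sketch of that pipeline on one made-up sentence (it assumes nltk with its tagger data is installed).

from itertools import groupby

from nltk import pos_tag, word_tokenize
from nltk.chunk import tree2conlltags
from nltk.chunk.regexp import RegexpParser

grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
chunker = RegexpParser(grammar)

tagged = pos_tag(word_tokenize("Automatic keyword extraction from scientific articles"))
iob = tree2conlltags(chunker.parse(tagged))  # [(word, pos, 'B-KT' / 'I-KT' / 'O'), ...]

# Consecutive non-'O' tokens form one candidate phrase.
candidates = [
    ' '.join(word for word, pos, chunk in group)
    for key, group in groupby(iob, lambda tag: tag[2] != 'O')
    if key
]
print(candidates)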
Example #3
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker    = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda iob: iob[2] != 'O'  # Python 3: tuple-unpacking lambdas are invalid
            ) if key
        ]

        # Yield candidates that are not filtered by stopwords and punctuation.
        for chunk in normalizer.normalize(chunks):
            yield chunk
Example #4
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):

    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        for phrase in phrases:
            yield phrase
Example #5
def generate_candidate(texts, method='phrase', remove_punctuation=True):
    """
    Generate word candidate from given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'
    remove_punctuation: bool, whether to strip punctuation before tokenizing

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
            # sentence = re.sub(r'[^\w]', ' ', sentence)
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
        tagged_words = pos_tag_sents(words_)  # POS tagging
        words_.clear()

        if method == 'word':
            tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
            tagged_words = chain.from_iterable(tagged_words)
            for word, tag in tagged_words:
                if tag in tags and word.lower() not in stop_words:
                    candidates.append(word)
        elif method == 'phrase':
            # grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
            grammar = r'KT: {(<JJ><NN.*>)' \
                      r' | (<NN.*><NN.*>) ' \
                      r' | (<NN.*><NN.*><NN.*>) ' \
                      r'| (<JJ><JJ><NN.*>+)' \
                      r' | (<JJ><NN.*><NN.*>)' \
                      r' | (<NN.*><JJ><NN.*>) ' \
                      r'| (<NN.*><IN><NN.*>) ' \
                      r'| (<JJ><NN.*><IN><NN.*>) ' \
                      r'| (<NN.*><IN><JJ><NN.*>) ' \
                      r'| (<JJ><NN.*><IN><JJ><NN.*>) }'
            chunker = RegexpParser(grammar)
            all_tag = chain.from_iterable(
                [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
            for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
                candidate = ' '.join([word for (word, pos, chunk) in group])
                if key is True and candidate not in stop_words:
                    candidates.append(candidate)
        else:
            print("Use either 'word' or 'phrase' in method")

    return candidates
Example #6
def buildchunkerlist(grammarlist, tagged):
    gtree = []
    for g in grammarlist:
        chunker = RegexpParser(g)
        OP = chunker.parse(tagged)
        if OP.height() >= 3:
            gtree.append(OP.subtrees(lambda t: t.height() == 2))

    return gtree
Example #7
def parseRelatedFeature(sent, tagged):
    
    chunker = RegexpParser(''' OP5: {<.*>+<NN>?<CD><.*>+<NN>?} ''')
    OP = chunker.parse(tagged)
    if OP.height() >= 3:
        for m in OP.subtrees(lambda t: t.height() == 2):
            for (word, tag) in m:
                # r3 is a module-level compiled regex defined outside this snippet
                if tag == "NN" and r3.match(word):
                    return True
Example #8
def getConcepts(text):
    grammar = """
        CONCEPT:   {(<DT>)?(<JJ>)?<NN|NNS>+}
    """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if (type(i) == Tree and i.label() == "CONCEPT"):
            current_chunk.append(" ".join([token
                                           for token, pos in i.leaves()]))
    return current_chunk
Example #9
def vocab_gen(texts, bool_key):
    list_word = []
    vocabs = []
    word_write = ""
    phrase_write = ""
    pos_write = ""
    sentences = sent_tokenize(texts)
    sentence_write = "\n".join(sentences)
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        list_word.append(words)
    words_w_pos = pos_tag_sents(list_word)  # POS
    # flatten the tagged sentences into a single list of (word, tag) tuples
    tagged_tokens = [j for sub in words_w_pos for j in sub]
    for i in tagged_tokens:
        pos_write += str(i)
        pos_write += "\n"
    # define grammar to pull out the phrases
    grammar = r'KT: ' \
              r'{' \
              r'(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+' \
              r'}'
    grammar = RegexpParser(grammar)
    all_tag = chain.from_iterable(
        [tree2conlltags(grammar.parse(tag)) for tag in words_w_pos])
    for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
        vocabs_temp = ' '.join([word for (word, pos, chunk) in group])
        if bool_key == 'Phrase':
            if key is True and vocabs_temp not in stop_words and len(
                    vocabs_temp) > 2 and (' ' in vocabs_temp) == True:
                vocabs.append(vocabs_temp)
                phrase_write += vocabs_temp
                phrase_write += "\n"
        else:
            if key is True and vocabs_temp not in stop_words and len(
                    vocabs_temp) > 2 and (' ' in vocabs_temp) == False:
                vocabs.append(vocabs_temp)
                word_write += vocabs_temp
                word_write += "\n"
    # use context managers so each log file is closed after writing
    with open(vocabs_word_path, 'w') as update_file:
        update_file.write(word_write)
    if bool_key == 'Phrase':
        with open(vocabs_phrase_path, 'w') as update_file:
            update_file.write(phrase_write)
    with open(sentence_path, 'w') as update_file:
        update_file.write(sentence_write)
    with open(pos_path, 'w') as update_file:
        update_file.write(pos_write)
    return vocabs
Example #10
def extract_from_sentences(sentences, add_verbs=True, language="english"):
    """
    Processes Sentence objects to calculate contained Noun Phrases based on a given grammar and maps them to the
    sentences they occur in.

    :param sentences: A list of Sentence objects.
    :param add_verbs: Optional. Default: True. Whether or not verbs are to be added to the mapping.
    :param language: Optional. Default: English. The language of the sentences.
    :return: A dictionary mapping tokens to the sentence IDs of the sentences they appear in.
    """
    # produce the mapping of sentences to their contained (words, pos) tuples
    pos_dictionary = {}
    NP_GRAMMAR_COMPOUND = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*((<CC>|,)<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*)*}"
    for sentence in sentences:
        pos_dictionary[sentence.sentence_id] = [
            (token, tag) for token, tag in sentence.tokens.items()
        ]
    parser_cmp = RegexpParser(NP_GRAMMAR_COMPOUND)
    term2sentence_id = {}
    lemmatizer = WordNetLemmatizer()
    for sentence_id, pos_tagged_tokens in pos_dictionary.items():
        if add_verbs:
            # updating the inverse occurrence index with verbs
            for subject, tag in pos_tagged_tokens:
                # check if subject is tagged as a verb
                if tag.startswith("VB"):
                    verb = lemmatizer.lemmatize(subject, "v").lower()
                    if verb not in stopwords.words(language):
                        if verb not in term2sentence_id:
                            term2sentence_id[verb] = set()
                        term2sentence_id[verb].add(sentence_id)
        # trying to parse the sentence_id into a top-level chunk tree
        tree = parser_cmp.parse(pos_dictionary[sentence_id])
        # getting the top-level tree triples and decomposing the NPs
        cmp_triples, simple_trees = get_cooccurence([tree],
                                                    ignore_stopwords=False,
                                                    language=language)
        smp_triples, _ = get_cooccurence(simple_trees,
                                         ignore_stopwords=True,
                                         language=language)
        # updating the inverse occurrence index with NPs
        for subject, _, objecT in cmp_triples + smp_triples:
            if subject.lower() not in term2sentence_id:
                term2sentence_id[subject.lower()] = set()
            if objecT.lower() not in term2sentence_id:
                term2sentence_id[objecT.lower()] = set()
            term2sentence_id[subject.lower()].add(sentence_id)
            term2sentence_id[objecT.lower()].add(sentence_id)
    return term2sentence_id
Example #11
def get_tokens(text):
    word_list = []
    voc = []
    voc_write = ''
    sent = sent_tokenize(text)
    word_single = word_tokenize(text)
    # mode 'w' creates the file if it does not exist, so no existence check is needed
    with open('token_log.txt', 'w', encoding='UTF8') as k:
        k.write(str(word_single))
    for i in sent:
        word = word_tokenize(i)
        words = list(map(lambda s: s.lower(), word))
        word_list.append(words)
    words_pos = pos_tag_sents(word_list)

    with open('pos_log.txt', 'w', encoding='UTF8') as f:
        f.write(str(words_pos))

    grammar = r'KT: ' \
              r'{' \
              r'(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+' \
              r'}'
    grammar = RegexpParser(grammar)

    tags = chain.from_iterable(
        [tree2conlltags(grammar.parse(tag)) for tag in words_pos])

    for key, group in groupby(tags, lambda tag: tag[2] != 'O'):
        voc_temp = ' '.join([word for (word, pos, chunk) in group])
        if key is True and voc_temp not in stopwords.words(
                'english') and voc_temp != 'https':
            voc.append(voc_temp)
            voc_write += voc_temp
            voc_write += '\n'
    with open('voc_log.txt', 'w', encoding='UTF8') as f:
        f.write(voc_write)
    return voc
Example #12
def getInstances(text):
    grammar = """
        PRE:   {<NNS|NNP|NN|NP|JJ|UH>+}
        MID: {<DT|IN|POS|FW|-|NP|NPS|NN|NNS>+}
        INSTANCE:   {(<DT+>)?(<JJ+>)?<PRE>(<MID><PRE>)?}
    """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if (type(i) == Tree and i.label() == "INSTANCE"):
            # print (i.leaves())
            current_chunk.append(" ".join([token
                                           for token, pos in i.leaves()]))
    return current_chunk
Example #13
    def __init__(self, name, is_lazy, lazy_directory, debug, rule):
        """
    Constructor of the component.

    @param  name:           The name of the component.
    @type   name:           C{string}
    @param  is_lazy:        True if the component must load previous data, False
                            if data must be recomputed even though it has already
                            been computed.
    @type   is_lazy:        C{bool}
    @param  lazy_directory: The directory used to store previously computed
                            data.
    @type   lazy_directory: C{string}
    @param  debug:          True if the component is in debug mode, else False.
                            When the component is in debug mode, it will output
                            each step of its processing.
    @type   debug:          C{bool}
    @param  rule:           The rule to parse NP chunks. It is expressed with
                            POS tags.
    @type   rule:           C{string}
    """

        super(NPChunkExtractor, self).__init__(name, is_lazy, lazy_directory,
                                               debug)

        self.set_np_chunker(RegexpParser("NP: " + rule))
Example #14
class KeyPhraseGenerator():
    """
    Extracts keyphrases from input list of strings.
    """
    def __init__(self, grammar=GRAMMAR, stopwords=STOPWORDS):

        self.chunker = RegexpParser(grammar)
        self.stopwords = stopwords

    def clean_text(self, txt):
        """
        Removes emoji and urls from text.
        """
        cleaned = cleaner.remove_emojis(txt)
        cleaned = cleaner.remove_urls(cleaned)
        return cleaned

    def clean_tagged_text(self, tagged_text):
        """
        Remove punctuation from tagged text.
        """
        punct_tagged = lambda word: all(
            unicat(char).startswith("P") and char != "," for char in word)
        cleaned = filter(lambda t: not punct_tagged(t[0]), tagged_text)
        return list(cleaned)

    def extract_keyphrases_single(self, txt):
        """
        Yields keyphrases for one piece of text.
        """
        for sent in txt:
            sent = self.clean_tagged_text(sent)
            if not sent:
                continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(chunks, lambda term: term[-1] != "O")
                if key
            ]
            for phrase in phrases:
                if phrase.lower() not in self.stopwords and len(phrase) > 2:
                    yield phrase

    def extract_keyphrases(self, txt_list):
        """
        Returns keyphrases for input list of strings.
        """
        key_docs = []
        for txt in txt_list:
            tagged_doc = []
            txt = self.clean_text(txt)
            for sent in nltk.sent_tokenize(txt):
                tagged_doc.append(nltk.pos_tag(nltk.word_tokenize(sent)))
            key_docs.append(list(self.extract_keyphrases_single(tagged_doc)))
        return key_docs
Example #15
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate word candidate from given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'
    remove_punctuation: bool, whether to strip punctuation before tokenizing

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence) # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_) # POS tagging

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable([tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
Example #16
def get_cooccurence(chunk_trees, ignore_stopwords=True, language="english"):
    """
    Parses a chunk tree and gets co-occurrence of terms.

    :param chunk_trees: Trees from the NLTK RegexpParser, generated over POS-tagged sentences using the provided grammar.
    :param ignore_stopwords: Optional. Default: True. Whether stopwords are to be ignored or not.
    :param language: Optional. Default: English. The language of the texts over which the chunk trees were generated.
    :return: A list of co-occurring tokens and simple parse trees generated over the leaves of the chunks of the
        provided trees.
    """
    triples = []
    simple_trees = []
    lemmatizer = WordNetLemmatizer()
    NP_GRAMMAR_SIMPLE = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+}"
    parser_simple = RegexpParser(NP_GRAMMAR_SIMPLE)
    for t in chunk_trees:
        entities = []
        for chunk in t:
            if isinstance(chunk, Tree) and chunk.label() == 'NP':
                # getting a tree for later processing of triples from the simple noun
                # phrases (if present)
                simple_trees.append(parser_simple.parse(chunk.leaves()))
                words = []
                for word, tag in chunk:
                    if (ignore_stopwords and word in stopwords.words(language)) or \
                            (not any(char.isalnum() for char in word)):
                        # do not process stopwords for simple trees, do not process purely
                        # non alphanumeric characters
                        continue
                    if tag.startswith('N'):
                        words.append(lemmatizer.lemmatize(word, 'n'))
                    elif tag.startswith('J'):
                        words.append(lemmatizer.lemmatize(word, 'a'))
                    else:
                        words.append(word)
                if len(words) > 0:
                    entities.append("_".join(words))
        for e1, e2 in combinations(entities, 2):
            triples.append((e1, "close to", e2))
            triples.append((e2, "close to", e1))
    return triples, simple_trees
Example #17
def create_phrase_vocabulary(raw_data):
    '''
    Extract a vocabulary of noun phrases. TfidfVectorizer only extracts plain n-grams
    automatically, so if we want a different format or a custom vocabulary, the
    vocabulary must be built explicitly.
    '''

    #grammar to extract the noun phrase
    grammar = r'NP: {(<JJ.*>* <VBN>? <NN.*>+ <IN>)? <JJ.*>* <VBG>? <NN.*>+}'

    #set the punctuation and chunker
    punct = set(string.punctuation)
    chunker = RegexpParser(grammar)

    def lambda_unpack(f):
        #function to unpack the tuple
        return lambda args: f(*args)

    #tokenize and create pos tags per sentence, then get its IOB tag
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data)
    noun_phrases = list(
        chain.from_iterable(
            tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in postag_sents))

    #join B-NP and I-NP tags as one noun phrase excluding O tags
    merged_nounphrase = [
        ' '.join(stemmer.stem(word) for word, pos, chunk in group).lower()
        for key, group in itertools.groupby(
            noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O'))
        if key
    ]

    #filter the term below than two characters and punctuation
    all_nounphrases = [
        cand for cand in merged_nounphrase
        if len(cand) > 2 and not all(char in punct for char in cand)
    ]

    #select distinct noun phrases
    vocabulary = (list(set(all_nounphrases)))
    return vocabulary
Example #18
def chunk_location_sent(pos_text, temp_text):
	list_of_locs = list()

	chunk_grammar = r"""

	LOC:   {((<CD>?<NNP>+<CD>?)|(<CD>?<NN>+<CD>?))+}

	"""
	chunker = RegexpParser(chunk_grammar)


	chunked_article = chunker.parse(pos_text)
	for subtree in chunked_article.subtrees(): 
		if subtree.label()=='LOC':
			#print(' '.join((tuples[0] for tuples in list(subtree))))
			#print(subtree.pprint())
			NNPs = ' '.join((tuples[0] for tuples in list(subtree)))
			#print("LOC: " + NNPs)
			list_of_locs.append(NNPs)
	#print("loc list:", list_of_locs)
	return list_of_locs
Example #19
    def getNounPhrases(self):

        featureSet = []

        # Handbook of NLP - Multiword Expressions, Timothy Baldwin and Su Nam Kim
        grammar = r"""
		    NBAR:
		    {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
		    NP:
		    {<NBAR>}
		    {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
		"""
        chunker = RegexpParser(grammar)

        for sentence in self.sentences:
            tokens = word_tokenize(sentence)

            if len(tokens) == 0:
                continue
            else:
                pass

            tagged = pos_tag(tokens)
            tree = chunker.parse(tagged)
            terms = []
            leafCollection = []

            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
                leafCollection.append(subtree.leaves())

            for leaf in leafCollection:
                term = [w for w, t in leaf if len(w) > 2]
                phrase = ' '.join(term)
                terms.append(phrase)

            featureSet += terms

        self.convertToFeatureDist(featureSet)
        self.helperObject.saveAllFeaturesExtracted(featureSet)
Example #20
	def getNounPhrases(self):
		
		featureSet = []
		
		# Handbook of NLP - Multiword Expressions, Timothy Baldwin and Su Nam Kim
		grammar = r"""
		    NBAR:
		    {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
		    NP:
		    {<NBAR>}
		    {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
		"""
		chunker = RegexpParser(grammar)
	
		for sentence in self.sentences:
			tokens = word_tokenize(sentence)
			
			if len(tokens) == 0:
				continue
			else:
				pass
			
			tagged = pos_tag(tokens)
			tree = chunker.parse(tagged)
			terms = []
			leafCollection = []
			
			for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
				leafCollection.append(subtree.leaves())
			
			for leaf in leafCollection:
				term = [w for w,t in leaf if len(w) > 2]
				phrase = ' '.join(term)
				terms.append(phrase)
			
			featureSet += terms
		
		self.convertToFeatureDist(featureSet)
		self.helperObject.saveAllFeaturesExtracted(featureSet)
Example #21
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.
    """
    def __init__(self, grammar=GRAMMAR):
        self.grammar = grammar
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent: continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                phrases = [
                    " ".join(word for word, pos, chunk in group).lower()
                    for key, group in groupby(
                        chunks, lambda term: term[-1] != 'O'
                    ) if key
                ]
                for phrase in phrases:
                    yield phrase

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield list(self.extract_keyphrases(document))
Example #22
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker    = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda iob: iob[2] != 'O'  # Python 3: tuple-unpacking lambdas are invalid
            ) if key
        ]
Example #23
def chunk_name_sent(pos_text, temp_text):
	list_of_names = list()

	chunk_grammar = r"""

	NAME: 	{<NNP>+}

	"""
	chunker = RegexpParser(chunk_grammar)


	chunked_article = chunker.parse(pos_text)
	#print("chunk:", chunked_article)
	for subtree in chunked_article.subtrees(): 
		if subtree.label()=='NAME':
			#print(' '.join((tuples[0] for tuples in list(subtree))))
			#print(subtree.pprint())
			NNPs = ' '.join((tuples[0] for tuples in list(subtree)))
			#print("..: ", NNPs)
			#print("LOC: " + NNPs)
			list_of_names.append(NNPs)

	#print("namelist: ", list_of_names)
	return list_of_names
Example #24
def extract_words(nodetext, t2, doc, location):
	try:
	#	tokenizer = RegexT(r'\w*[a-zA-Z]\w*')
	#	return tokenizer.tokenize(nodetext)
	#except TypeError:
	#	return []
		grammar = "NP: {<JJ>*<NN>+}"
		phrases = []
		final_phrases = []
		for sent in sent_tokenize(nodetext):
			doc.add_sentence(Sentence(location, sent))
			tag_list = t2.tag(word_tokenize(sent))
			parser = RegexpParser(grammar)
			result = parser.parse(tag_list)
			for phrase in result:
				if isinstance(phrase, NLTREE.Tree) and phrase.label() == "NP":
					phrases.append("_".join([word for word,pos in phrase.leaves()]))
					#n_phrase = "_".join([word for word,pos in phrase.leaves()])
					#if any(c.isdigit() for c in n_phrase):
				#		continue
				#	elif '.' in n_phrase:
				#		continue
				#	else:
				#		doc.add_word(Word(location, n_phrase, sent))

	except TypeError:
		return []
	for phrase in phrases:
		if any(c.isdigit() for c in phrase):
			continue
		elif '.' in phrase:
			continue
		else:
			final_phrases.append(phrase)

	return final_phrases
Example #25
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.
    """
    def __init__(self, grammar=GRAMMAR):
        self.grammar = grammar
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent: continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                phrases = [
                    " ".join(word for word, pos, chunk in group).lower()
                    for key, group in groupby(
                        chunks, lambda term: term[-1] != 'O'
                    ) if key
                ]
                for phrase in phrases:
                    yield phrase

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield list(self.extract_keyphrases(document))
Example #26
    def __init__(self, grammar=GRAMMAR, stopwords=STOPWORDS):

        self.chunker = RegexpParser(grammar)
        self.stopwords = stopwords
Example #27
def apply_grammar(pos_words):
    grammar_parser = RegexpParser(GRAMMAR)
    return grammar_parser.parse(pos_words)
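A hedged usage sketch for apply_grammar: the GRAMMAR constant is not shown in the snippet, so the value below is an assumption borrowed from other examples on this page, and the function is restated so the sketch stands alone.

from nltk import pos_tag, word_tokenize
from nltk.chunk.regexp import RegexpParser

GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'  # assumed value, not from the original

def apply_grammar(pos_words):
    grammar_parser = RegexpParser(GRAMMAR)
    return grammar_parser.parse(pos_words)

print(apply_grammar(pos_tag(word_tokenize("shallow parsing of noun phrases"))))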
Example #28
 def regex_chunk(self, tagged, pattern):
     pr = RegexpParser(pattern)
     chunked = [pr.parse(sent) for sent in tagged]
     return chunked
Example #29
 def tagChunk(self, taggedword, loops=2):
     ## Chunking
     cp = RegexpParser(self.grammar, loop=loops)
     print('tagged word')
     print(taggedword)
     return cp.parse(taggedword)
Example #30
ADJ_1: {<ADJ> <INTERJ|break>* <ADJ>+}
ADJ_1: {<ADJ>}
DET: {<NUM_ORD|NUM|PRON_POSS|EGEN_GEN|N_GEN>}
DET2: {<PRON_DEMO|PRON_PERS>}
DET3: {<PRON_UBST>}
NP: {<DET2|PRON_1|DET|DET3> <INTERJ|break>* <N> <INTERJ|break>* <ADJ_1>}
NP: {<DET2|PRON_1|DET|DET3> <INTERJ|break>* <ADJ_1|DET|DET3> <INTERJ|break>* <N>+}
NP: {<DET2|PRON_1|DET3|DET> <INTERJ|break>* <DET>* <INTERJ|break>* <ADJ_1|DET3>+ <N>*}
NP: {<DET2|PRON_1|DET|DET3> <INTERJ|break>* <N>+}
NP: {<ADJ_1|DET3> <INTERJ|break>* <N>}
NP: {<PRON_1>}
NP: {<DET2> <INTERJ|break>* <DET3>}
NP: {<PRON_INTER_REL|EGEN|N|DET2|DET3>}
"""

parser = RegexpParser(rules)

tokenized = word_tokenize('I am a bird')
tags = pos_tag(tokenized)


def parse_sentences(data):
    chunked_sentences = []
    for s in data:
        chunked = parser.parse(s)
        chunked_sentences.append(chunked)
    return chunked_sentences


def IOB(list):
    return [
Example #31
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extract adverbial and adjective phrases, and transform
    documents into lists of these keyphrases, with a total
    keyphrase lexicon limited by the nfeatures parameter
    and a document length limited/padded to doclen
    """
    def __init__(self, nfeatures=100000, doclen=60):
        self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
        # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
        # self.grammar = r'KT: {<RB.>|<JJ.>}'
        self.chunker = RegexpParser(self.grammar)
        self.nfeatures = nfeatures
        self.doclen = doclen

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(c).startswith('P') for c in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_candidate_phrases(self, sents):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Extract phrases, rejoin with a space, and yield the document
        represented as a list of its keyphrases.
        """
        for sent in sents:
            sent = self.normalize(sent)
            if not sent: continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(chunks, lambda term: term[-1] != 'O')
                if key
            ]
            for phrase in phrases:
                yield phrase

    def fit(self, documents, y=None):
        return self

    def get_lexicon(self, keydocs):
        """
        Build a lexicon of size nfeatures
        """
        keyphrases = [keyphrase for doc in keydocs for keyphrase in doc]
        fdist = FreqDist(keyphrases)
        counts = fdist.most_common(self.nfeatures)
        lexicon = [phrase for phrase, count in counts]
        return {phrase: idx + 1 for idx, phrase in enumerate(lexicon)}

    def clip(self, keydoc, lexicon):
        """
        Remove keyphrases from documents that aren't in the lexicon
        """
        return [
            lexicon[keyphrase] for keyphrase in keydoc
            if keyphrase in lexicon.keys()
        ]

    def transform(self, documents):
        docs = [list(self.extract_candidate_phrases(doc)) for doc in documents]
        lexicon = self.get_lexicon(docs)
        clipped = [list(self.clip(doc, lexicon)) for doc in docs]
        return sequence.pad_sequences(clipped, maxlen=self.doclen)
Example #32
 def __init__(self, grammar=GRAMMAR):
     self.grammar = grammar
     self.chunker = RegexpParser(self.grammar)
Example #33
pronounsent_nounDict = defaultdict(
    list
)  #key:tuple(pronoun,sentence_num) val:list(list(tuple(noun,pos)))     noun not normalized

grammar = """NP:{<DT>?<JJ>*(<NN.*>)+}    
               PR:{<PRP.*>}
            """

#grammar for tagging noun phrases and pronouns
#DT - determiners eg: The, a, an, my
#JJ - adjectives
#NN.* - any type of noun
#PRP - personal pronoun eg: He, she, I, We, they

rp = RegexpParser(grammar)
count = 0
for s in listOfTaggedSents:

    chunkedTree = ParentedTree.convert(
        rp.parse(s))  #tree of chunked parts of the sentence
    #ParentedTree is used to convert tagged words to tree structure
    neTree = ne_chunk(s)  #tree with named entity tags

    #print (chunkedTree)
    #chunkedTree.draw()
    #neTree.draw()

    for n in chunkedTree:
        if isinstance(n, nltk.tree.Tree):
            if n.label() == 'NP':
Example #34
from nltk.corpus import wordnet as wn
from nltk.chunk import tree2conlltags
from nltk.probability import FreqDist
from nltk.chunk.regexp import RegexpParser
from unicodedata import category as unicat
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag

GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
GOODTAGS = frozenset(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
GOODLABELS = frozenset(['PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'])

grammar = GRAMMAR
chunker = RegexpParser(grammar)
tweet_tokenizer = TweetTokenizer()
labels = GOODLABELS


def normalize(sent):
    """
    Removes punctuation from a tokenized/tagged sentence and
    lowercases words.
    """
    sent = tweet_tokenizer.tokenize(sent)
    sent = [x for x in sent if not 'http' in x]
    is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
    sent = filter(lambda t: not is_punct(t[0]), sent)
    #     sent = map(lambda t: (t[0].lower(), t[1]), sent)
    sent = map(lambda t: t.lower(), sent)
Example #35
path_to_jar_p = "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser.jar"
path_to_models_jar_p = "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar"
dependency_parser = StanfordDependencyParser(
    path_to_jar=path_to_jar_p, path_to_models_jar=path_to_models_jar_p)

from nltk.chunk.regexp import RegexpParser

grammar = '''
    NP: {<DT>? <JJ>* <NN>*} # NP
    P: {<IN>}           # Preposition
    V: {<V.*>}          # Verb
    PP: {<P> <NP>}      # PP -> P NP
    VP: {<V> <NP|PP>*}  # VP -> V (NP|PP)*
'''

reg_parser = RegexpParser(grammar)
parser = stanford.StanfordParser(
    model_path=
    "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/lexparser.sh"
)
lmtzr = WordNetLemmatizer()

#def file_len(fname):
#    with open(fname) as f:
#        for i, l in enumerate(f):
#            pass
#        return i + 1
#********************** --Create read in fn-- ******************

#fname = "002-0.cex"
indir = '/Users/clairekelleher/Desktop/Thesis/Data/PItt_cookie_all_test'
Example #36
def summarizer(tex, reduce_per):
    def norm(word, pos='x'):  #normalizes all words except proper nouns
        word = word.lower()
        if pos not in ['NNP', 'NNPS']:
            wnl = WordNetLemmatizer()
            word = wnl.lemmatize(word)
        return (word)

    sentList = sent_tokenize(tex)  #list of all tokenized sentences

    #print(sentList)

    sentNounDict = defaultdict(
        list
    )  # a dictionary key:sentence_number value:all nouns in the sentence... (nouns are normalised)

    for s in sentList:
        for w, pos in pos_tag(word_tokenize(s)):
            if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                sentNounDict[sentList.index(s)].append(norm(w, pos))
    #print (sentNounDict)

    wordSentDict = defaultdict(
        list
    )  # a dictionary key:(word,pos) value:all sentences it appears in...(word is normalised)

    for s in sentList:
        for w, pos in pos_tag(word_tokenize(s)):
            wordSentDict[(norm(w, pos), pos)].append(sentList.index(s))
    #print (wordSentDict)


#list of all nouns in the text
    listOfNouns = list(
        sorted(
            set([
                norm(w, pos) for s in sentList
                for w, pos in pos_tag(word_tokenize(s))
                if pos in ['NN', 'NNS', 'NNP', 'NNPS']
            ])))
    #print (listOfNouns)

    listOfTaggedSents = [
    ]  #list of sentences of tokenized words with postags- list[tuple(w,pos)]

    for s in sentList:
        l = [(n, pos) for n, pos in pos_tag(word_tokenize(s))]
        listOfTaggedSents.append(l)
    #print (listOfTaggedSents)

    mostSigNoun = []  #most recently encountered significant noun
    mostSigNounObject = [
    ]  #most recently encountered significant noun which is not a person
    mostSigNounPerson = [
    ]  #most recently encountered significant noun which has named entity as person

    pronounNounDict = defaultdict(
        list
    )  #key:tuple(pronoun,sentence_num) val:list(list(tuple(noun,pos))) (noun not normalized)

    #grammar for tagging noun phrases and pronouns
    grammar = """NP:{<DT>?<JJ>*(<NN.*>)+}    
                   PR:{<PRP.*>}
                """
    rp = RegexpParser(grammar)
    for s in listOfTaggedSents:
        begin = True
        chunkedTree = ParentedTree.convert(
            rp.parse(s))  #tree of chunked parts of the sentence
        neTree = ne_chunk(s)  #tree with named entity tags
        #print (chunkedTree)
        #chunkedTree.draw()
        for n in chunkedTree:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NP':
                    if begin == True:
                        mostSigNoun = [
                            w for w in n
                            if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                        ]
                        #print (mostSigNoun)
                        for ne in neTree:
                            if isinstance(ne, nltk.tree.Tree):
                                if ne[0] in mostSigNoun:
                                    if ne.label() == 'PERSON':
                                        mostSigNounPerson = []
                                        mostSigNounPerson.append(ne[0])
                                    else:
                                        mostSigNounObject = []
                                        mostSigNounObject.append(ne[0])
                        begin = False

                if n.label() == 'PR':
                    pron = n[0][0].lower()
                    #print pron
                    if pron in ['it', 'its']:  #for objects
                        if len(mostSigNounObject) > 0:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNounObject)
                        else:  #if mostsignounobject does not exist
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNoun)
                    else:
                        if len(mostSigNounPerson) > 0:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNounPerson)
                        else:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNoun)
                    begin = False
                    #print pronounNounDict

                    #adding the nouns corresponding to the pronouns to sentworddict and wordsentdict
                    for v1 in pronounNounDict[(pron,
                                               listOfTaggedSents.index(s))]:
                        for v11 in v1:  #it is a list of lists
                            sentNounDict[listOfTaggedSents.index(s)].append(
                                norm(v11[0], v11[1]))
                            wordSentDict[(norm(v11[0],
                                               v11[1]), v11[1])].append(
                                                   listOfTaggedSents.index(s))

    #print (sentNounDict)
    #print (wordSentDict)
    #print (pronounNounDict)

    for key, val in sentNounDict.items():  #making sentnoundict a set
        val = list(set(val))
        sentNounDict[key] = val
    #print (sentNounDict)

    #following code calculates the distance between two phrases
    distance = defaultdict(
        int
    )  #a dict.. key:(noun or noun(pronoun),sentence_num) value:position in the sentence from the begining

    for s in listOfTaggedSents:
        dist = 0
        chunkedTree = ParentedTree.convert(rp.parse(s))
        for n in chunkedTree:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NP':
                    tempNoun = [
                        w[0] for w in n
                        if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                    ]
                    for w in tempNoun:
                        distance[(norm(w), listOfTaggedSents.index(s))] = dist
                if n.label() == 'PR':
                    pron = n[0][0].lower()
                    tempNoun = pronounNounDict[(pron,
                                                listOfTaggedSents.index(s))]
                    for v1 in tempNoun:
                        for v11 in v1:
                            distance[(norm(v11[0], v11[1]),
                                      listOfTaggedSents.index(s))] = dist
            dist += 1
    #print (distance)

    #the following code assigns relation factor between two nouns
    nounGraph = np.zeros((len(listOfNouns), len(listOfNouns)))

    for key, value in sentNounDict.items():
        for v1 in value:
            for v2 in value:
                d = 0
                if v2 != v1:
                    d = distance[v1, key] - distance[v2, key]
                    nounGraph[listOfNouns.index(v1)][listOfNouns.index(
                        v2)] += float((100 / (abs(d) + 1)))
                    #if nounGraph[listOfNouns.index(v1)][listOfNouns.index(v2)]>=100:
                    #print(v1+' '+v2+" "+str(d))

    #print(nounGraph)

    nounPriority = defaultdict(
        int
    )  #dict to hold noun priorities... key:noun(normalized)  value:priority
    sentencePriority = defaultdict(
        int
    )  #dict to hold sentence priorities...key:sentence_num   value:priority

    def calcNounPriority(
    ):  #function calculates the noun priority(sum of weights of all the edges attached to this noun in the noungraph)
        total = 0
        i = 0
        for x in nounGraph:
            total = sum(x)
            nounPriority[listOfNouns[i]] = total
            i += 1

    #print (sorted(nounPriority.items(),key=lambda x:x[1], reverse=True))

    def calcSentPriority(
    ):  #function calculates sentence priority(sum of priorities of all nouns in the sent)
        for key, value in sentNounDict.items():
            total = 0
            for n in value:
                total += nounPriority[n]
                sentencePriority[key] = total

    calcNounPriority()
    calcSentPriority()

    #print (sorted(sentencePriority.items(),key=lambda x:x[1], reverse=True))
    #for i in range(len(sentList)):
    #print(str(i)+' '+sentList[i])

    reducingFactor = 0.9  #10%
    summary = []  #list to hold the summary
    reduce_per = reduce_per / 100
    #print(reduce_per)
    for i in range(int(len(sentencePriority) * reduce_per)):
        summary.append(max(sentencePriority.items(), key=lambda x: x[1]))
        #print (summary)
        j = summary[-1][0]

        for n in sentNounDict[j]:
            nounPriority[
                n] *= reducingFactor  #reduce the priority of all nouns in the picked sentence

        del sentNounDict[j]
        del sentencePriority[j]  #remove the picked sentence
        calcSentPriority()  #recalculate sentence priority

    #print ("\n\n")
    i = 1
    s_list = []
    for s in sorted(summary):
        #print (i,sentList[s[0]])
        s_list.append(sentList[s[0]])
        i += 1

    return (s_list)
Example #37
File: chunk.py | Project: detik19/BimaNLP
 def tagChunk(self, taggedword, loops=2):
     ## Chunking
     cp = RegexpParser(self.grammar, loop=loops)
     return cp.parse(taggedword)
Example #38
'''------------------- POS Tagging   --------------------------------------------------------------'''
'''-----------------------------------------------------------------------------------------'''
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger

train_sents = treebank.tagged_sents()[:3000]

tagger1 = DefaultTagger('NN')
tagger2 = UnigramTagger(train_sents, backoff=tagger1)
'''-----------------------------------------------------------------------------------------'''
'''------------------- Chunking with POS Tagging ---------------------------------------------------'''
'''-----------------------------------------------------------------------------------------'''
chunker = RegexpParser(r'''
    NP:
        {<DT>?<NN.*><VB.*><DT.*>?<NN.*>}
        {<DT>?<NN.*><IN><DT><NN.*>}
        {<NN.*><VB.*><NN.*>}
        
    ''')

chunker2 = RegexpParser(r'''
    Phrase:
        {<JJ.*><NN.*>}
        {<RB><JJ>^<NN.*>}
        {<JJ><JJ>^<NN.*>}
        {<NN.*><JJ>^<NN.*>}
        {<RB.*><VB.*>}
    ''')


chunkerPOS = RegexpParser(r'''
Example #39
 def __init__(self, grammar=GRAMMAR):
     self.grammar = grammar
     self.chunker = RegexpParser(self.grammar)
Example #40
def makeParser(grammar=r"""
  NP: {<JJ.*>?<NN.*>+}
"""):
    return RegexpParser(grammar)
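A short illustration (not in the original) of calling makeParser and walking the resulting NP subtrees; it assumes the function defined just above plus the usual nltk tokenizer/tagger data.

from nltk import pos_tag, word_tokenize

parser = makeParser()  # default grammar: an optional adjective followed by one or more nouns
tagged = pos_tag(word_tokenize("regular expression chunkers build shallow parse trees"))
for subtree in parser.parse(tagged).subtrees(lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))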