Example No. 1
def process_score_doc(kword, doc):
    with io.open(doc, 'r', errors='ignore') as f:
        s = f.read()

    candidate_sents = []

    orig_sentences = sent_tokenize(s)
    sentences = [word_tokenize(sent) for sent in orig_sentences]
    sentences = [pos_tag(sent) for sent in sentences]

    sent_index = 0
    for sent in sentences:
        NER_sent = ne_chunk(sent)
        iob_tags = tree2conlltags(NER_sent)
        for i in iob_tags:
            if i[2] in question_tag:
                # calculate combined TF-IDF
                combined_tf_idf = 0
                for k in question_keywords:
                    tf_s = tf_sent(k, sent)
                    idf_s = math.log(float(len(sentences) - tf_s + 0.5)/float(tf_s + 0.5))
                    tf_idf_s = tf_s*idf_s
                    combined_tf_idf += tf_idf_s
                candidate_sents.append((orig_sentences[sent_index], combined_tf_idf))

            else:
                pass
        sent_index += 1

    return list(set(candidate_sents))
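
For reference, a minimal self-contained sketch of the same sentence-scoring idea: it keeps sentences whose IOB tags match the expected answer type and scores them with the tf*idf formula above. The globals question_keywords and question_tag and the tf_sent helper below are illustrative stand-ins for the ones Example 1 assumes.

import math

from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.chunk import tree2conlltags

# Illustrative stand-ins for the globals Example 1 relies on.
question_keywords = ['einstein', 'born']
question_tag = {'B-GPE', 'I-GPE'}   # keep sentences that mention a geopolitical entity

def tf_sent(keyword, tagged_sent):
    """Hypothetical helper: term frequency of a keyword in a POS-tagged sentence."""
    return sum(1 for word, _ in tagged_sent if word.lower() == keyword)

text = ("Albert Einstein was born in Ulm, Germany. "
        "He enjoyed sailing. He also played the violin.")
orig_sentences = sent_tokenize(text)
sentences = [pos_tag(word_tokenize(s)) for s in orig_sentences]

candidate_sents = []
for orig, sent in zip(orig_sentences, sentences):
    iob_tags = tree2conlltags(ne_chunk(sent))
    if any(tag in question_tag for _, _, tag in iob_tags):
        combined_tf_idf = 0.0
        for k in question_keywords:
            tf_s = tf_sent(k, sent)
            idf_s = math.log((len(sentences) - tf_s + 0.5) / (tf_s + 0.5))
            combined_tf_idf += tf_s * idf_s
        candidate_sents.append((orig, combined_tf_idf))

print(candidate_sents)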
Example No. 2
def entities2token(tokenized_sentence, name_token=None, gpe_token=None):
    # Check whether we do any NE replacement. Avoids building the tree in some cases
    if name_token is not None or gpe_token is not None:

        tagged = nltk.pos_tag(tokenized_sentence)
        # Tag the named entities
        ne_tagged = nltk.tree2conlltags(nltk.ne_chunk(tagged))

        # Replace names
        if name_token is not None:
            ne_tagged = [(name_token, tag,
                          ne_tag) if ne_tag.endswith('PERSON') else
                         (token, tag, ne_tag)
                         for (token, tag, ne_tag) in ne_tagged]
        # Replace geopolitical entities
        if gpe_token is not None:
            ne_tagged = [(gpe_token, tag,
                          ne_tag) if ne_tag.endswith('GPE') else
                         (token, tag, ne_tag)
                         for (token, tag, ne_tag) in ne_tagged]

        # Discard the NE tokens
        tagged = [(token, tag) for (token, tag, ne_tag) in ne_tagged]

        # Recollect the tokens
        tokens = [token for (token, tag) in tagged]
    else:
        tokens = tokenized_sentence
    # Convert them to lowercase
    tokens = [token.lower() for token in tokens]
    return tokens
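
To see what Example 2 iterates over, here is a short sketch (with an illustrative sentence) that prints the (token, POS, IOB NE) triples produced by tree2conlltags; the endswith('PERSON') / endswith('GPE') tests above match both the B- and I- variants of those tags.

import nltk

tokenized_sentence = nltk.word_tokenize("Angela Merkel visited Paris last week.")
tagged = nltk.pos_tag(tokenized_sentence)
ne_tagged = nltk.tree2conlltags(nltk.ne_chunk(tagged))

# Each element is (token, POS tag, IOB NE tag), e.g. ('Angela', 'NNP', 'B-PERSON');
# the exact tags depend on the NLTK models installed.
for token, tag, ne_tag in ne_tagged:
    print(token, tag, ne_tag)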
Example No. 3
def Ext_Chunks(sents):
    NP_li = []
    # print(sents)
    grammar_exp = r"""
      CHUNK: {<NN><NN.*><NN.*>+}   # chunk runs of three or more nouns
             }<NNP>+{              # chink (exclude) sequences of proper nouns
    """
    # cp = nltk.RegexpParser('CHUNK:  {<NN><NN.*><NN.*>+}}<NNP>{')
    cp = nltk.RegexpParser(grammar_exp)
    # cp = nltk.RegexpParser('CHUNK:  {<DT>?<JJ.*>*<NN.*>+}')

    for sent in sents:
        tree = cp.parse(sent)
        # print(tree.draw())
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                print(subtree)
                iob_tags = tree2conlltags(subtree)
                iob_tree = conlltags2tree(iob_tags)
                print(iob_tags)
                print(iob_tree)
                # Join the chunk's words directly instead of string-mangling str(subtree)
                chunk_words = ' '.join(word for word, tag in subtree.leaves())
                NP_li.append(chunk_words)
                print(chunk_words, '\n')
    print('----------------------------------------------------------------\n',NP_li)
    return NP_li
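
Example 3 converts each CHUNK subtree to IOB triples and straight back; the round trip is easier to see on a whole sentence. A self-contained sketch with an illustrative noun-phrase grammar:

import nltk
from nltk.chunk import conlltags2tree, tree2conlltags

cp = nltk.RegexpParser('NP: {<DT>?<JJ>*<NN.*>+}')   # illustrative grammar
tagged = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumped over the lazy dog."))

tree = cp.parse(tagged)
iob_tags = tree2conlltags(tree)        # Tree -> [(word, pos, 'B-NP'/'I-NP'/'O'), ...]
print(iob_tags)

round_trip = conlltags2tree(iob_tags)  # [(word, pos, iob), ...] -> Tree
print(round_trip == tree)              # the two representations carry the same information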
Example No. 4
def google_search(question):
    first_page = google.search(question, 1)
    #print(first_page)
    top_results = []
    for i in range(5):
        top_results.append(first_page[i].description)

    first_search = ' '.join(top_results).encode('ascii', 'replace').decode('ascii')
    #print(first_search)

    ne_tree = ne_chunk(pos_tag(word_tokenize(first_search)))

    iob_tagged = tree2conlltags(ne_tree)

    ss = [tuple(map(str, eachTuple)) for eachTuple in iob_tagged]
    question_type = classify_question(question)
    print('question_type: ', question_type)
    if question_type == 'None':
        ans = "Oops! I don't know."
    else:
        google_answer = []
        if question_type == 'Person':
            for i in range(len(ss)):
                if ss[i][2] == 'B-PERSON' or ss[i][2] == 'I-PERSON':
                    google_answer.append(ss[i][0])
        elif question_type == 'Country':
            print('country identified')
            for i in range(len(ss)):
                if ss[i][2] == 'B-GPE' or ss[i][2] == 'I-GPE':
                    google_answer.append(ss[i][0])
        elif question_type == 'Location':
            for i in range(len(ss)):
                if ss[i][2] == 'B-LOCATION' or ss[i][2] == 'I-LOCATION':
                    google_answer.append(ss[i][0])
        elif question_type == 'Date':
            for i in range(len(ss)):
                if ss[i][2] == 'B-DATE' or ss[i][2] == 'I-DATE':
                    google_answer.append(ss[i][0])
        print('google: ', google_answer)
        if not google_answer:
            ans = "Oops, I don't know! "
        else:
            print('inside else')
            counts = collections.Counter(google_answer)
            print('counts: ', counts)
            t = counts.most_common(4)
            candidate_answer = [seq[0] for seq in t]
            print(candidate_answer)
            #new_list = sorted(google_answer, key=lambda x: -counts[x])
            #print 'new_list',new_list
            #ans = ' '.join(new_list)
            for i in range(len(candidate_answer)):
                candidate_answer[i] = 'Candidate Answer ' + str(
                    i + 1) + ' ' + candidate_answer[i]
            candidate_answer = '\n'.join(candidate_answer)
            ans = candidate_answer
    return ans
Example No. 5
def money_ner(words_tagged):
    
    grammar = 'NumPhrase: {<CD|NNS><CD|NNS|JJ>}'
    t_parser = nltk.RegexpParser(grammar)
    final_tree = t_parser.parse(words_tagged)
    final_tags  = tree2conlltags(final_tree)
    
    return final_tags
Example No. 6
 def to_dataset(cls, parsed_sentences, feature_detector):
     X,y = [],[]
     for parsed in parsed_sentences:
         iob_tagged = tree2conlltags(parsed)
         words, tags, iob_tags = zip(*iob_tagged)
         tagged = list(zip(words, tags))
         for index in range(len(iob_tagged)):
             X.append(feature_detector(tagged, index, history=iob_tags[:index]))
             y.append(iob_tags[index])
     return X,y
Example No. 7
def chunking_on_sentence(sentence):

    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    parser = nltk.RegexpParser(pattern)
    parsed_sentence = parser.parse(sentence)

    bio_tagged_sentence = nltk.tree2conlltags(parsed_sentence)
    # ne_chunk expects POS-tagged (word, tag) pairs, not IOB triples,
    # so run it on the original POS-tagged sentence rather than the CoNLL tags.
    tree = nltk.ne_chunk(sentence)

    return bio_tagged_sentence, tree
Example No. 8
def recognize_ne(s):
    """
    Recognize named entities in the given sentence using the NLTK package.
    :param s: POS-tagged sentence (a list of (word, tag) tuples) to chunk.
    :return: IOB-tagged (word, pos, ne_tag) triples.
    """
    ne_tree = nltk.ne_chunk(s, binary=False)
    iob_tags = nltk.tree2conlltags(ne_tree)
    return iob_tags
Example No. 9
 def getNamedEntities(self, text):
     ne_set = set()
     try:
         tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
         iob_tagged = nltk.tree2conlltags(tree)
         for obj in iob_tagged:
             if obj[1] == 'NNP' and len(obj[0]) > 3: ne_set.add(obj[0])
     except Exception:
         print("error in NER")
     return ne_set
Example No. 10
def process_language(phrase):
    processedPhrase = list(my_bigram_tokens(nltk.word_tokenize(phrase)))
    processedPhrase = list(nltk.pos_tag(processedPhrase))
    #lemmas=list([lemma_with_default(T,mytagfixes) for T in processedPhrase])
    chunks = clean_chunks(list(nltk.tree2conlltags(cp.parse(processedPhrase))))
    processedPhrase = remove_stops_puncs(processedPhrase, stops)
    #lemmas=list([lemma_with_default(T,mytagfixes) for T in processedPhrase])
    #synons=list([get_synset(lemm) for lemm in lemmas])
    #return {'keywords':processedPhrase, 'lemmas':lemmas, 'synsets':synons}
    return processedPhrase, chunks
Example No. 11
 def ner_tag_text(self, text):
     """
     NER tag text
     :param text:
     :return: CONLL IOB format text
     """
     with open(self.config_util.TRAIN_MODEL_PICKLE, 'rb') as pickle_file:
         chunker_pickle = pickle.load(pickle_file)
     return tree2conlltags(
         chunker_pickle.parse(pos_tag(word_tokenize(text))))
Example No. 12
def calculateParameters(doc: str,
                        scores: Dict[str, float],
                        cands,
                        pr: Dict[str, float] = None):
    params = []

    max_cand_score = max(scores.values())
    all_cands = cands
    for cand in all_cands:

        freq = doc.count(cand)

        # pagerank_score = pr[cand]

        if cand not in scores:
            cand_score = 0.
        else:
            cand_score = scores[cand] / max_cand_score

        cand_len = len(cand)
        cand_term_count = len(cand.split())

        first_match = doc.find(cand) / len(doc)
        last_match = doc.rfind(cand) / len(doc)
        ne_cand = get_true_case(cand)
        words = nltk.pos_tag(nltk.word_tokenize(ne_cand))
        ne = nltk.tree2conlltags(nltk.ne_chunk(words))
        ne = [
            ' '.join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(ne, lambda tpl: tpl[2] != 'O')
            if key
        ]

        ne_cnt = len(ne[0].split()) if ne else 0

        if first_match == last_match:
            spread = 0.
        else:
            spread = last_match - first_match

        params.append([
            cand_score, cand_len, cand_term_count, first_match, 1 - last_match,
            ne_cnt
        ])  #, pagerank_score])  # , r[cand]])

    params = np.array(params)
    max_ = params.max(axis=0)
    params = np.divide(params,
                       max_,
                       out=np.zeros_like(params),
                       where=max_ != 0)
    return dict(zip(all_cands, params))
Example No. 13
    def get_nltk_vectors(self, texts: List[str]):
        # https://gist.github.com/japerk/1909413
        from textblob import TextBlob
        sid = self.nltk_sid
        vsid = self.vader_sid
        pdict = self.pdict
        n_tokens_in = self.n_tokens_in
        rake = self.rake_nltk
        nltk_texts = [fasttext.tokenize(text) for text in texts]
        textblob_sentiments = [[sentiment.polarity, sentiment.subjectivity] for sentiment in [TextBlob(text).sentiment for text in texts]]
        textblob_sentiments = torch.tensor(textblob_sentiments).unsqueeze(1).expand(len(texts), n_tokens_in, 2)
        textblob_sentiments = textblob_sentiments.to(get_device())

        mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), nltk_texts)), n_tokens_in)
        mask = mask.to(get_device())
        mask = self.is_mask_em(mask)
        has_digit = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), nltk_texts)), n_tokens_in)
        has_digit = has_digit.to(get_device())
        has_digit = self.has_digit_em(has_digit)

        m = self.text_model
        nltk_emb = stack_and_pad_tensors([torch.tensor([m[t] for t in sent]) for sent in nltk_texts], n_tokens_in) # if t in m else np.zeros(m.vector_size)
        nltk_emb = nltk_emb.to(get_device())
        sid_vec = torch.tensor([list(sid.polarity_scores(t).values()) for t in texts])
        sid_vec = sid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, sid_vec.size(1))
        sid_vec = sid_vec.to(get_device())
        vsid_vec = torch.tensor([list(vsid.polarity_scores(t).values()) for t in texts])
        vsid_vec = vsid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, vsid_vec.size(1))
        vsid_vec = vsid_vec.to(get_device())
        conlltags = [[ptags for ptags in nltk.tree2conlltags(ne_chunk(pos_tag(x)))] for x in nltk_texts]

        pos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[tag.lower()] for token, tag, ne in x]), conlltags)), n_tokens_in)
        pos = pos.to(get_device())
        pos_emb = self.tag_em(pos)
        ner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[ne.lower().split("-")[-1]] for token, tag, ne in x]), conlltags)), n_tokens_in)
        ner = ner.to(get_device())
        ner_emb = self.tag_em(ner)

        phrases = [get_rake_nltk_phrases(rake, t) for t in texts]

        key_wc_rake_nltk = [get_rake_nltk_wc(tokens, phr) for tokens, phr in zip(nltk_texts, phrases)]
        key_wc_rake_nltk = stack_and_pad_tensors(key_wc_rake_nltk, self.n_tokens_in)
        key_wc_rake_nltk = key_wc_rake_nltk.to(get_device())
        nltk_rake_vectors = self.key_wc_rake_nltk(key_wc_rake_nltk)

        result = torch.cat([vsid_vec, nltk_emb, textblob_sentiments, pos_emb, ner_emb, nltk_rake_vectors, sid_vec, mask, has_digit], 2)
        result = result.to(get_device())
        result = self.nltk_nn(result)
        return result
Example No. 14
    def __init__(self, chunked_sents, feature_detector, classifier_builder,
                 **kwargs):
        # Transform the trees in IOB annotated sentences [(word, pos, chunk), ...]
        chunked_sents = [tree2conlltags(sent) for sent in chunked_sents]

        chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

        self.feature_detector = feature_detector

        self.tagger = ClassifierBasedTaggerBatchTrained(
            train=(sent for sent in chunked_sents),
            feature_detector=self.feature_detector,
            classifier_builder=classifier_builder)
Example No. 15
 def get_chunktag(self, sentence):
     grammar = r"""
           NP: {<DT|JJ|P.*P.*|NN.*>+}
           PP: {<IN>+}
           VP: {<VB.*>+}
           ADVP: {<RB>+}
       """
     pos_sent = nltk.pos_tag(sentence)
     cp = nltk.RegexpParser(grammar)
     chunk_tree = cp.parse(pos_sent)
     chunk_tags = tree2conlltags(chunk_tree)
     chunk_tags = [ck[-1] for ck in chunk_tags]
     return chunk_tags
Example No. 16
    def evaluate(self, gold):
        # Convert nltk.Tree chunked sentences to (word, pos, iob) triplets
        chunked_sents = [tree2conlltags(sent) for sent in gold]

        # Convert (word, pos, iob) triplets to tagged tuples ((word, pos), iob)
        chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

        print(chunked_sents)

        dataset = self.tagger._todataset(chunked_sents)
        featuresets, tags = zip(*dataset)
        predicted_tags = self.tagger.classifier().classify_many(featuresets)
        return accuracy(tags, predicted_tags)
Example No. 17
    def checking_org(self, text):
        # First list of words, and cleaning from stopwords.
        words = word_tokenize(text)
        words = [w for w in words \
                    if w.lower() not in stopwords.words('english')]
        # Tagging the list.
        ptree = pos_tag(words)
        # Finally we simplify the tree and check whether any word represents an
        # organization. This check could definitely be improved.
        for w in tree2conlltags(ne_chunk(ptree)):
            if (w[2][2:] == 'ORGANIZATION') and (w[1] == 'NNP'):
                return True

        return False
Example No. 18
def pos_tag_nltk(pos_tagger, sentence):

    tokens = word_tokenize(sentence)  # tokenization

    # pos_tagging | this gives us the (WORD,POS)
    pos_tags = pos_tagger.tag(tokens)

    # create the tree; the tree is necessary to do IOB tagging with tree2conlltags,
    # so we need to convert pos_tags to a tree with ne_chunk
    tree = ne_chunk(pos_tags)

    # IOB tagging | this gives us (WORD,POS,TAG) with tree2conlltags
    iob_tags = tree2conlltags(tree)

    return iob_tags
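
Example 18 takes the POS tagger as an argument. A minimal sketch of an equivalent call with NLTK's pretrained PerceptronTagger standing in for the injected pos_tagger (any object exposing a .tag(tokens) method works):

from nltk import ne_chunk, word_tokenize
from nltk.chunk import tree2conlltags
from nltk.tag import PerceptronTagger

pos_tagger = PerceptronTagger()                       # NLTK's default pretrained tagger
tokens = word_tokenize("Barack Obama was born in Hawaii.")
pos_tags = pos_tagger.tag(tokens)                     # (WORD, POS) pairs
iob_tags = tree2conlltags(ne_chunk(pos_tags))         # (WORD, POS, TAG) triples
print(iob_tags)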
Example No. 19
def tree2brackets(tree):
    str, tag = '', ''
    for item in tree2conlltags(tree):
        if item[2][0] in {'B', 'O'} and tag:
            str += tag + '] '
            tag = ''

        if item[2][0] == 'B':
            tag = item[2].split('-')[1]
            str += '['
        str += item[0] + ' '

    if tag:
        str += tag + '] '

    return str.strip()
Example No. 20
 def on_get(self, req, resp, id):
     print(id)
     arts = []
     arts_obj = ArticleModel.objects().all_fields().limit(10)
     for art in arts_obj:
         title = word_tokenize(art['title'])
         tagged = pos_tag(title)
         tree = ne_chunk(tagged)
         iob_tags = tree2conlltags(tree)
         print(str(art['_id']), tagged, iob_tags)
         arts.append({
             "_id": str(art["_id"]),
             "tag": tagged,
             "tree": iob_tags
         })
     resp.json = {"rslt": json.dumps(arts)}
Example No. 21
def calculateParameters(all_cands, doc, scores):
    params = []

    max_cand_score = max(scores.values())

    for cand in all_cands:

        freq = doc.count(cand)

        if cand not in scores:
            cand_score = 0.
        else:
            cand_score = scores[cand]  # / max_cand_score

        cand_len = len(cand)
        cand_term_count = len(cand.split())
        ne_cand = get_true_case(cand)
        words = nltk.pos_tag(nltk.word_tokenize(ne_cand))
        ne = nltk.tree2conlltags(nltk.ne_chunk(words))
        ne = [
            ' '.join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(ne, lambda tpl: tpl[2] != 'O')
            if key
        ]

        ne_cnt = len(ne[0].split()) if ne else 0

        first_match = doc.find(cand) / len(doc)
        last_match = doc.rfind(cand) / len(doc)

        # if cand_term_count == 1:
        #     cohesion = 0.
        # else:
        #     cohesion = cand_term_count * (1 + math.log(freq, 10)) * freq /

        if first_match == last_match:
            spread = 0.
        else:
            spread = last_match - first_match

        # print([cand_score, freq, cand_len, cand_term_count, first_match, last_match, spread, ne_cnt])

        params.append([
            cand_score, cand_len, cand_term_count, first_match, last_match,
            spread, ne_cnt
        ])  #cand_score,
    return params
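
Examples 12 and 21 both rely on the same itertools.groupby idiom to merge runs of contiguous non-'O' IOB triples into entity strings; a self-contained sketch of that step on an illustrative sentence:

import itertools

import nltk

words = nltk.pos_tag(nltk.word_tokenize("Tim Cook opened new offices in Austin, Texas."))
ne = nltk.tree2conlltags(nltk.ne_chunk(words))

# Group consecutive triples whose IOB tag is not 'O'; each run becomes one entity string.
entities = [
    ' '.join(word for word, pos, chunk in group).lower()
    for key, group in itertools.groupby(ne, lambda tpl: tpl[2] != 'O')
    if key
]
print(entities)   # e.g. ['tim cook', 'austin', 'texas'], depending on the NE model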
Example No. 22
def tweet_ner_tagger(text_list, st, cp):
    text = [
        "URL" if word[0].startswith("http") else word[0] for word in text_list
    ]
    gold_tag = [word[1] for word in text_list]

    tokenized_text = text

    ner_taggers = st.tag(tokenized_text)
    pos_taggers = nltk.pos_tag(tokenized_text)
    chunk_taggers = tree2conlltags(cp.parse(pos_taggers))

    ner_sequence = [item[1] for item in ner_taggers]
    pos_sequence = [item[1] for item in pos_taggers]
    chunking_sequence = [item[1] for item in chunk_taggers]

    return text, gold_tag, ner_sequence, pos_sequence, chunking_sequence
Example No. 23
def get_iob(rl, name, book_analysis=False):
    tokens = list(
        filter(lambda token: token not in string.punctuation,
               word_tokenize(rl)))
    tagged_tokens = pos_tag(tokens)
    ner_tree = ne_chunk(tagged_tokens)
    iob_tagged = tree2conlltags(ner_tree)
    persons = list(filter(lambda x: "PERSON" in x[2], iob_tagged))
    tokens = list(map(lambda token: str(token).lower(), tokens))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    stop = stopwords.words("english")
    no_stopwords = [item for item in lemmas if item not in stop]
    if book_analysis:
        print(f"{name} length: {len(rl)}")
        print(f"{name} persons: {len(persons)}")
        print(f"{name} tokens: {len(tokens)}")
    return persons, no_stopwords
Example No. 24
File: brain.py Project: 7ae/lucia
    def understand(self, sentence):
        # Break paragraph into sentences
        tokenized_sentence = sent_tokenize(sentence)

        # Break sentence into words
        for sent in tokenized_sentence:
            tokenized_word = word_tokenize(sent)

            # Tag corpora with universal POS tagset
            # For tag list, read https://www.nltk.org/book/ch05.html#tab-universal-tagset
            pos_tags = nltk.pos_tag(tokenized_word, tagset='universal')

            # Divide sentence into noun phrases with regular expression
            grammar = 'NOUN: {<DET>?<ADJ>*<NOUN>}'
            cp = nltk.RegexpParser(grammar)
            # Find chunk structure
            cs = cp.parse(pos_tags)
            # B-{tag} beginning, I-{tag} inside, O outside
            iob_tags = np.asarray(tree2conlltags(cs)).tolist()

            # Recognize named entities
            doc = self.nlp(sent)

            # Parse word into numeral, ordinal, and time
            parse = lambda ne: dict([[
                _['dim'], _['value']['value']
            ] for _ in self.duckling.parse(
                ne, dim_filter=conf.get_property('duckling')['dimensions'])])
            # [Word, character positions and entity type]. For all entity types, read https://spacy.io/api/annotation#named-entities
            ne = list([
                ent.text, ent.start_char, ent.end_char, ent.label_,
                parse(ent.text)
            ] for ent in doc.ents)

            ne_tags = [_.ent_type_ for _ in doc]
            # Merge iob tags and named entity tags
            tagged_sent = [
                list(np.append(iob_tags[i], ne_tags[i]))
                for i in range(len(iob_tags))
            ]
            tagged_sent = ''.join(str(x) for x in tagged_sent)

            self.decide(tagged_sent, ne)
Example No. 25
    def to_dataset(self, parsed_sentences):
        """
        Transform a list of tagged sentences into a scikit-learn compatible POS dataset.
        """

        X, y = [], []
        for parsed in parsed_sentences:
            iob_tagged = tree2conlltags(parsed)
            words, tags, iob_tags = zip(*iob_tagged)

            tagged = list(zip(words, tags))

            for index in range(len(iob_tagged)):
                X.append(
                    self._feature_detector(tagged,
                                           index,
                                           history=iob_tags[:index]))
                y.append(iob_tags[index])

        return X, y
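
The _feature_detector callable used above is supplied by the caller; a minimal, hypothetical detector with the expected (tagged, index, history) signature, driven the same way the loop in Example 25 drives it:

import nltk

def simple_feature_detector(tagged, index, history):
    """Hypothetical detector: current word and POS plus the previous IOB tag."""
    word, pos = tagged[index]
    return {
        'word': word.lower(),
        'pos': pos,
        'prev-iob': history[-1] if history else '<START>',
    }

parsed = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize("Apple opened a new store in Berlin.")))
iob_tagged = nltk.tree2conlltags(parsed)
words, tags, iob_tags = zip(*iob_tagged)
tagged = list(zip(words, tags))

X, y = [], []
for index in range(len(iob_tagged)):
    X.append(simple_feature_detector(tagged, index, history=iob_tags[:index]))
    y.append(iob_tags[index])

print(X[0], y[0])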
Example No. 26
def qa_generator(inputStr):
    original_statement = inputStr
    tokens = nltk.word_tokenize(inputStr)
    tagged = nltk.pos_tag(tokens)
    copy = tagged

    #tree2conlltags gives a list of tuples, each tuple has the word, POS, and entity in that order
    entities = (nltk.tree2conlltags(nltk.ne_chunk(tagged)))
    print('ENTITIES BELOW')
    print(entities)

    #separates the tuples into lists of words, POS tags and entity tags
    words, tags, ent = zip(*entities)
    words = list(words)
    ent = list(ent)
    questions = list()
    i = 0
    while i <= len(entities) - 1:
        #get words whose NE tag is B-PERSON or I-PERSON
        #Create a question that has both the first and last name? May not be super critical
        '''TAGS (tree2conlltags emits the B/I/O scheme, so no L- or U- tags appear here)
        B-egin - first token of a multi-token entity
        I-n - inner token of a multi-token entity
        O-ut - a non-entity token
        '''

        if ent[i] == 'B-PERSON':
            questions.append("Who is " + words[i] + "?")
        elif ent[i] == 'I-PERSON':
            questions.append("Who is " + words[i] + "?")
        elif ent[i] == 'B-ORGANIZATION':
            questions.append("What is " + words[i] + "?")
            questions.append("Where is " + words[i] + "?")
            questions.append("What does " + words[i] + " do?")

        i = i + 1

    for i in questions:
        print(i)
Example No. 27
def ner_analyse(text, chunker):
    """
    extract human activity information from text(filted text with only time labeled sent )
    :param text: doc
    :return: list of tuple(activity elements)
    """
    sents = nltk.sent_tokenize(text)
    result = []
    for sent in sents:
        if not re.match(r'(.*\d\d\d\d.*)|(.*\d\ds*)', sent):
            continue
        entities = chunker.parse(pos_tag(word_tokenize(sent)))
        entities = nltk.tree2conlltags(entities)
        has_per = False
        has_loc = False
        has_org = False
        has_tim = False
        print('Analysing following sentence:\n{0}'.format(
            sent.encode('utf-8')))
        for entity in entities:
            # print('Entity[2] is \n{0}'.format(entity[2]))
            if entity[2] == 'B-per':
                has_per = True
            elif entity[2] == 'B-tim':
                has_tim = True
            elif entity[2] == 'B-loc':
                has_loc = True
            elif entity[2] == 'B-org':
                has_org = True
        if has_per and has_tim and (has_loc or has_org):
            # nltk.conlltags2tree(entities).draw()
            print('Yes!  This sentence has per tim and org|loc\n'
                  'Its entities are like:\n {0}'.format(entities))
            result.append(entities)
        else:
            print('No! This sentence does not meet our standard\n'
                  'Its entities are like:\n{0}'.format(entities))

    return result
Example No. 28
    def tree_forming(self):
        query = self.__dict__['query']

        q_tags = nltk.pos_tag(nltk.word_tokenize(query))

        par = nltk.RegexpParser('CHUNK: {<JJ>*<NN | NNS>*}')
        chunk = par.parse(q_tags)

        tree_q = nltk.tree2conlltags(chunk)
        langlist = []
        # Initialise to avoid a NameError if the first chunk appears before any verb.
        string = ""

        print(tree_q)
        for tup in tree_q:
            if tup[1] == 'VB':
                string = tup[0]
            elif tup[2] == "B-CHUNK" or tup[2] == "I-CHUNK":
                string += tup[0]
            else:
                continue
            langlist.append(string)
            string = ""

        print(langlist)
Example No. 29
def test_regex(frases, testmode):
    # Split the input into sentences.
    frases = nltk.sent_tokenize(frases)
    # Tokenize each sentence.
    tokens = [nltk.word_tokenize(frase) for frase in frases]
    # Apply the HMM tagger.
    tagged = [hmm_tagger.tag(token) for token in tokens]
    # COMIDA (food): detects simple food nouns, nouns followed by an adjective ("pollo asado"), and phrases like "pincho de tortilla" or "pollo con tomate".
    # CANTIDAD (quantity): detects words and numbers.
    cp = nltk.RegexpParser('''
                           COMIDA: {(<ncms000>|<ncmp000>|<ncfs000>|<Fpt>)+(<aq0ms0|aq0fs0>)*<sps00>+(<ncms000>|<ncmp000>|<ncfs000>|<da0fs0>|<Fpt>)+}   
                           COMIDA: {(<ncms000>|<ncmp000>|<ncfs000>|<Fpt>)+(<aq0ms0|aq0fs0>)*}   
                           CANTIDAD: {(<di0ms0>|<dn0cp0>|<pi0ms000>|<di0fs0>|<Z>)+}
                           ''')
    # Apply the RegexpParser to our tagged tokens.
    for s in tagged:
        result = cp.parse(s)
        #result.draw()
        if testmode == True:
            diccionario = diccionario_regex(result)
            print(diccionario)
        iob_tags = tree2conlltags(result)

    return iob_tags
Example No. 30
path=r"/home/arushi/toi_news_articles"

len_art=[]
city_name=[]

for filename in os.listdir(path):
   print(filename)
   toi2 = open(os.path.join(path, filename), "r")
   data = toi2.read().replace('\n', '')
   toi2.close()
   #len_art.append(len(data.split()))
   words=word_tokenize(data)
  
   #print(nltk.pos_tag(words))
   
   tree=entities(data)
   iob_tags = tree2conlltags(tree)
   #print(iob_tags)
   
   for tup in iob_tags:
       if tup[2] == "B-GPE" or tup[2] == "I-GPE":
           city_name.append(tup[0])
   
   #print(tree)
   #tree.draw()
   
   
   
print(city_name)

import pandas as pd
df = pd.DataFrame(city_name, columns=["column"])