Example #1
        def make_pos(target_tag, edit_rev_triple):
            tags, srcs, dsts = edit_rev_triple

            # target_tag: must be present in the sentence
            # Before POS tagging, remove the positions where delete/add tags occur from the sentence
            if target_tag == del_tag:
                sentence = dsts
            elif target_tag == add_tag:
                sentence = srcs

            if target_tag in tags:
                tag_indexes = [i for i, x in enumerate(tags) if x == target_tag]
                trimed = sentence
                for tag_index in tag_indexes:
                    trimed = trimed[:tag_index] + trimed[tag_index+1:]

                posed = pos_tag(trimed)
                pos = [w[1] for w in posed]
                for tag_index in tag_indexes:
                    pos.insert(tag_index, u'')

                #debug
                None_indexes = [i for i, x in enumerate(pos) if x == u'']
                if tag_indexes != None_indexes:
                    print(tag_indexes, file=sys.stderr)
                    print(None_indexes, file=sys.stderr)
                    print(tags, file=sys.stderr)
                    print(pos, file=sys.stderr)
            else:
                posed = pos_tag(u' '.join(sentence).split())
                pos = [w[1] for w in posed]

            return pos
def number_of_exact_word_match(a, b, word_tokenizer, lemmatizer, stop_words):
    pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
    pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a \
                    if token.lower().strip(punctuation) not in stop_words]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b \
                    if token.lower().strip(punctuation) not in stop_words]
    matched_words = set(lemmae_a).intersection(lemmae_b)
    return [len(matched_words), matched_words, b]
def number_of_noun_match(a, b, word_tokenizer, lemmatizer, stop_words):
    pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
    pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a \
                    if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b \
                    if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
    # Calculate Jaccard similarity
    #ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(set(lemmae_a).union(lemmae_b)))
    #return (ratio > 0.66)
    matched_words = set(lemmae_a).intersection(lemmae_b)
    return [len(matched_words), matched_words, b]
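The commented-out Jaccard calculation above can also live in its own helper. A minimal sketch over the same lemma sets (the 0.66 threshold comes from the comment; the helper name is made up here):

def noun_jaccard_similarity(lemmae_a, lemmae_b, threshold=0.66):
    # Jaccard similarity of the two lemma sets, as in the commented-out lines above
    union = set(lemmae_a).union(lemmae_b)
    if not union:
        return 0.0, False
    ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(union))
    return ratio, ratio > threshold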
 def _process_simpleHash(self, simpleHash):
     # Extract entities from keys resulting from SimpleExtractor process_*
     entityHash = {}
     for data in simpleHash:
         occs = simpleHash[data]['occurences']
         proxLoc = simpleHash[data]['proxLoc']
         # Tokenize sentences
         for sent in tokenize_sentences(data):
             # Tokenize words
             tokens = tokenize_words(sent)
             # Tag words with Parts of Speech
             tagged = pos_tag(tokens)
             # Identify named entities
             entities = ne_chunk(tagged)
             for ent in entities:
                 if isinstance(ent, NLTKParseTree):
                     # Is it a wanted type?
                     if ent.label() in self.types:
                         # Should we keep the PoS tag?
                         if self.keepPos:
                             txts = ['/'.join(token) for token in ent.leaves()]
                         else:
                             txts = [token[0] for token in ent.leaves()]
                         txt = ' '.join(txts)
                         new = {txt: {'text': txt,
                                      'occurences': occs,
                                      'proxLoc': proxLoc[:]}}
                         entityHash = self._mergeHash(entityHash, new)
     return entityHash
Example #5
def parse(body):
    
    contents = []
    if isinstance(body, str):
        contents.append(body)
    else:
        contents = body

    sentences = []
    for content in contents:
        sentences.extend([sentence for sentence in sent_tokenize(content) if not str_helper.hasHTMLTag(sentence)])
        
    stop = stopword.get_stopwords()
    tokens = {}

    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word not in stop and not str_helper.hasNumbers(word) and not str_helper.hasPunctuation(word):
                word = stem.stemming(word)
                tokens.setdefault(word, 0)
                tokens[word] += 1

    wp = pos_tag(tokens.keys())
    words = [row[0] for row in wp]
    tags = [row[1] for row in wp]

    return words, tags
def extract_onlynouns(tokens):
    out = list()
    for token in tokens:
        pos = pos_tag(nltk.word_tokenize(token.lower()))[0][1]
        if (pos == "NN") or (pos == "NNP"):
            out.append(token)
    return out
    def get_final_query_with_tokens(self, query):
        """ Get final query with keywords selected and processed 
        
            Parameters
            ----------
            
            query : string, mandatory
                The initial query 
        """
        
        result = {}
        result['query'] = query
        words = pos_tag(nltk.word_tokenize(query))
        #grammar = "NP: {<DT>?<JJ>*<NN>}"
        #cp = nltk.RegexpParser(grammar)
        #tree = cp.parse(words);
        #print(tree)
        #NPs = list(tree.subtrees(filter=lambda x: x.label()=='NP' or x.label().startswith('NN') or x.label()=='WP'))
        #print [' '.join(NP.leaves()[0]) for NP in NPs ]
        #print [(word, pos) for word,pos in words if pos.startsWith('NN') or pos == 'WP']
        result['select'] = self.get_select_tokens(words)
        
        # return query and required tokens
        return result
        
        
#qs = QueryTokenizer()
#inputs = ['s', 'what type of persons travels ?', 'who are travelling ?', 'what is the tour schedule ?', 'what is the cost for a tour?']
#for q in inputs:
#    print qs.get_final_query_with_tokens(q)
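A runnable sketch of the commented-out noun-phrase route above; the grammar is the one from the comment, while the helper name and the join step are assumptions:

import nltk
from nltk import pos_tag, word_tokenize

def extract_noun_phrases(query, grammar="NP: {<DT>?<JJ>*<NN>}"):
    # Tag the query, chunk it with the NP grammar, and join each NP's words
    tree = nltk.RegexpParser(grammar).parse(pos_tag(word_tokenize(query)))
    return [' '.join(word for word, tag in subtree.leaves())
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP')]

#extract_noun_phrases('what is the cost for a tour?')  # e.g. ['the cost', 'a tour']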
Example #8
def main():

  wsjsubset = open("../corpus/wsjsubset", 'r').readlines()
  genia = open("../corpus/genia", 'r').readlines()

  words = []
  postag = []
  chunktag = []
  for line in wsjsubset:
    if len(line.split()) > 0:
      words.append(line.split()[0])
      postag.append(line.split()[1])
      chunktag.append(line.split()[2])
  
  postag_nltk = pos_tag(words) 
  hits = 0
  fails = {}
  for i in range(len(postag_nltk)):
    if postag_nltk[i][1] == postag[i]:
      hits += 1
    else:
      fails[(postag[i], postag_nltk[i][1])] = fails.get((postag[i], postag_nltk[i][1]), 0) + 1

  accuracy = hits/float(len(postag))
  for fail in fails:
    fails[fail] = fails[fail] / float(len(postag) - hits)

  for key, value in fails.items():
    print(value, key)
def extract_entities(words):
    entities = []
    for chunk in ne_chunk(pos_tag(words)):
        if hasattr(chunk, 'label'):
            performer = ' '.join(c[0] for c in chunk.leaves())
            entities.append(performer.lower())
    return entities
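A quick usage sketch for extract_entities (the sentence is just an example; entity boundaries depend on NLTK's default NE chunker, so treat the output as approximate):

from nltk import word_tokenize

words = word_tokenize("Barack Obama visited the European Parliament in Brussels")
print(extract_entities(words))  # e.g. ['barack obama', 'european parliament', 'brussels']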
Example #10
def tagging_ranks(theList):
    
    importantWords = []

    upTo = len(theList)
    for i in range(0, upTo):
        
        print(theList[i][0])
        poS = tag.pos_tag(tokenize.word_tokenize(theList[i][0]))
        print(poS)
        theList[i].append(poS[0][1])
        
        if theList[i][1]<4:
            break
        
    print(theList)
    
    for words in theList:
        if len(words) > 2:
            if words[2].startswith(('JJ', 'NN', 'RB', 'VB')):
                importantWords.append(words)
    
    print(importantWords)
    return "Passed"
Example #11
def process_raw_text(text):
    """
        First some code to standardize the formatting, then basic nlp.
    """
    # Remove breaks and tabs
    for char in ["\t", "\n"]:
        text = text.replace(char, " ")
    text = text.replace('."', '".')
    text = text.replace(".'", "'.")
    # Split special characters from words
    for char in ["'", '"', ",", ".", "?", "!", ";", ":"]:
        text = text.replace(char, " " + char + " ")
    # Magic to remove all multi-spaces
    text = ' '.join(text.split())

    # get the words, sentences, POS tags, and chunks.
    chunks = [ tuple([ c.type for c in t.chunks ]) for t in parsetree(text) ]
    sentences = sent_tokenize(text)
    sentences = [ word_tokenize(s) for s in sentences ]
    sentences_tags = [ tuple([ (w, simplify_tag(t)) for w, t in pos_tag(s) ]) for s in sentences ]

    sentences = [ tuple([ w for w, _ in s]) for s in sentences_tags ]
    tags = [ tuple([ t for _, t in s]) for s in sentences_tags ]
    words = flatten(sentences)

    return tuple(words), tuple(sentences), tuple(tags), tuple(chunks)
def title_permutations(title_expanded):
    title_tagged = pos_tag(title_expanded.split())
    st = PorterStemmer()
    title_pos = [st.stem(word) for word, pos in title_tagged if pos != 'IN']

    title_perms = list(map("*".join, permutations(title_pos)))
    return title_perms
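For illustration, one call (this assumes the names used above — pos_tag, PorterStemmer and itertools.permutations — are imported in the surrounding module):

# "of" is tagged IN and dropped, the remaining words are stemmed before building permutations
print(title_permutations("head of sales"))
# e.g. ['head*sale', 'sale*head']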
def processoFeatures(resposta):
    frases = tokenizerFrases.tokenize(resposta["corpo"])
    palavras = []
    palavrasTexto = {}
    for frase in frases:
        palavrasTemp = tokenizerPalavras.tokenize(frase)
        for palavra in palavrasTemp:
            palavras.append(palavra)
            palavrasTexto[palavra] = True
    posTags = pos_tag(palavras)
    positivo = 0
    negativo = 0
    for palavra, tag in posTags:
        synsets = None
        if tag.startswith("J"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
        elif tag.startswith("V"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
        elif tag.startswith("N"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
        elif tag.startswith("R"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
        else:
            synsets = sentiwordnet.senti_synsets(palavra, "")
        if synsets != None:
            synsets = list(synsets)
            if len(synsets) > 0:
                synset = synsets[0]
                positivo = positivo + synset.pos_score()
                negativo = negativo + synset.neg_score()
    if positivo > negativo:
        return (palavrasTexto, "positivo")
    elif negativo > positivo:
        return (palavrasTexto, "negativo")
    else:
        return (palavrasTexto, "neutro")
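A minimal look at the SentiWordNet lookup this function relies on (requires the sentiwordnet and wordnet corpora to be downloaded; like the function above, it reads scores off the first synset):

from nltk.corpus import sentiwordnet, wordnet

synsets = list(sentiwordnet.senti_synsets('good', wordnet.ADJ))
first = synsets[0]
print(first.pos_score(), first.neg_score(), first.obj_score())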
def GetContractPage(x):
    url = 'http://www.defense.gov/contracts/contract.aspx?contractid=%d' % x
    html = urllib.urlopen(url).read()
    if re.search("The Official Home of the Department of Defense", html):
        return
    soup = BeautifulSoup(html)
    p_tags = soup.findAll("p")
    p_tags_text_list = [tag.text for tag in p_tags]

    tokenized_list = []

    for text in p_tags_text_list:
        tokenized_list = tokenize.word_tokenize(text)
        tokenized_list.append(nltk_tag.pos_tag(tokenized_list))
    tagged_list = tokenized_list[-1]

    data = {
        "url": url}

    for token in tagged_list[1:]:
        if token[1]=="NNP":
            data['entity'] = token[0]
            break
    

    for token in tagged_list[1:]:
        if token[1]=="CD":
            data['Amount'] = token[0]
            break

    print(data)
Example #15
    def test_run():
        results = {}
        nouns = []
        product_list = {}
        for p in Post.query.all():
            tagged_sent = pos_tag(p.story.split())
            propernouns = [word for word,pos in tagged_sent if pos == 'NNP']
            for n in propernouns:
                if n == "I’m" or n == "It’s" or n == "Can’t":
                    continue
                results[n.replace('.', '')] = True

        for r in results.keys():
            nouns.append(r)

        for i in range(10):
            noun = random.choice(nouns)
            # print('Using "%s"', (noun,))
            for k in test_keywords:
                try:
                    products = amazon.search(Keywords=noun, SearchIndex=k)
                    for product in products:
                        product_list[product.title] = True
                except:
                    continue

        for p in product_list.keys():
            print("     Found title: %s" % (p,))
Example #16
def count_words_unigram_pos(input_filename, output_path=''):

    txt = get_file_text(input_filename)

    word_regex = '[a-zA-Z]+'
    word_frequency = {}
    total_words = 0.

    matches = re.findall(word_regex, txt, re.M + re.S + re.U)
    for m in matches:
        word_frequency[m] = word_frequency.get(m, 0.) + 1.
        total_words+=1.

    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))

    word_analysis = []
    for word in sorted_words:
        pos = pos_tag([word[0]])
        word_analysis.append([word[0], word[1], pos[0][1]])

    o_file = make_output_file(input_filename, output_path=output_path, prefix='', suffix='-words_unigram_pos')
    o_file.write('word\tcount\tpos\n')
    for w in word_analysis:
        o_file.write('%s\t%d\t%s\n' % (w[0], w[1], w[2]))

    o_file.close()
Example #17
    def generate_searches(posts):
        results = {}
        nouns = []
        for p in posts:
            tagged_sent = pos_tag(p.story.split())
            propernouns = []
            last_noun = False
            for word,pos in tagged_sent:
                if pos == 'NNP':
                    if last_noun:
                        propernouns[-1] = propernouns[-1] + ' ' + word
                    else:
                        propernouns.append(word)
                        last_noun = True
                else:
                    last_noun = False

            for n in propernouns:
                if n == "I’m" or n == "It’s" or n == "Can’t":
                    continue
                results[n.replace('.', '')] = True

        for r in results.keys():
            nouns.append(r)

        return nouns
Example #18
def lda_train(raw):
    stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()
    text_array = []
    for i in range(len(raw)):
        text = raw[i].lower()
        text = text.replace('\r\n', ' ')
        text = re.sub("[^a-z0-9]", " ", text)
        # Tokenization segments a document into its atomic elements.
        words = text.split()
        # Stop words
        # Certain parts of English speech, like (for, or) or the word the are meaningless to a topic model.
        # These terms are called stop words and need to be removed from our token list.
        words = [j for j in words if j not in stop]
        tokenized = nltk.word_tokenize(text)
        tagged_sent = pos_tag(words)
        words = [word for word,pos in tagged_sent if pos == 'NN']
        # Stemming words is another common NLP technique to reduce topically similar words to their root.
        # stemming reduces those terms to stem. This is important for topic modeling, which would otherwise view those terms as separate entities and reduce their importance in the model.
        #words = [p_stemmer.stem(s) for s in words]
        text_array.append(words)
    dictionary = corpora.Dictionary(text_array)
    dictionary.save('dictionary.dic')
    corpus = [dictionary.doc2bow(text) for text in text_array]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=15, id2word=dictionary, passes=20)
    filename = 'finalized_model_15.sav'
    joblib.dump(ldamodel, filename)
    print(ldamodel.print_topics(num_topics=15, num_words=6))
    return ldamodel,dictionary
Example #19
def filter_by_pos(sentence, pos):
    """Returns the sentence with only words whose part of speech is in
    ACCEPTED_POS.
    """
    words_with_pos = pos_tag(word_tokenize(sentence))
    words_with_pos = filter(lambda word: word[1] in pos, words_with_pos)
    return ' '.join(map(lambda word_with_pos: word_with_pos[0], words_with_pos))
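Example call, keeping only nouns and adjectives; the tag set passed in is just an illustration, and pos_tag/word_tokenize are assumed to be imported in the surrounding module:

print(filter_by_pos("The quick brown fox jumps over the lazy dog", {"NN", "JJ"}))
# e.g. "quick brown fox lazy dog"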
Example #20
def keep_nouns(tf):
    n_tf = {}
    for k in tf:
        if pos_tag([k])[0][1].find('N') == 0:
            n_tf[k] = tf[k]

    return n_tf 
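A quick check of the filter; tagging isolated words with pos_tag is rough, so treat the output as approximate:

tf = {'city': 5, 'quickly': 2, 'beautiful': 1}
print(keep_nouns(tf))  # e.g. {'city': 5}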
Example #21
def extract_tags(comment_file):
    result = {} 
    
    fd = open(comment_file, "r")
    for s in fd:
        m = s.replace(",",".").replace("and", ".").replace("or",".").replace(":",".").split(".")
        for f in m:
            d = wordpunct_tokenize(f)
            for index, t in enumerate(d):
                pos_str = ""
                if t in cellphone_attribute:
                    before = index-10
                    if before < 0:
                        before = 0
                    end = index + 10
                    if end > len(d)-1:
                        end = len(d)-1
                    pos_result = pos_tag(d[before:end])
                    for pos_index, pos_sent in enumerate(pos_result):
                        seg_for_word = ""
                        adjust_word = ""
                        if pos_sent[1].find("JJ") != -1:
                            seg_for_word =  ' '.join(d[index:pos_index + before + 1])
                            adjust_word = pos_sent[0]
                            if pos_index+ before < index:
                                seg_for_word = ' '.join(d[pos_index+before:index+1])
                            add_into_dict(result, t, adjust_word, seg_for_word)
    return result
Example #22
 def _get_nn(self, sentence):
     '''get common nouns (NN)'''
     tagged_sent = pos_tag(sentence)
     propernouns = [word for word,pos in tagged_sent if pos == 'NN']
     regex = re.compile('[^a-zA-Z]')
     propernouns = [regex.sub('', i) for i in propernouns]
     return propernouns
Example #23
def extractNounListFromTweetFile(filePath, fileName):
    fileTarget = open(filePath + "\\" + fileName, 'r')
    fileName = "processedTweets.txt"
    processedTweetsFileTarget = open(filePath + "\\" + fileName , 'w')
    allNouns = []

    i = 0
    for line in fileTarget:
        lineContent = line.split("\t")

        userId = lineContent[0]
        tweet = lineContent[1]
        #latitude = int(lineContent[2])
        #longitude = int(lineContent[3].strip("\n"))

        processedTweet, hashTags = preprocessTweet(tweet)
        taggedTweet = pos_tag(processedTweet.split())
        nounsInTweet = [word for word, pos in taggedTweet if pos == 'NNP' or pos == 'NN']

        allNouns.extend(nounsInTweet)

        processedTweetsFileTarget.write(str(i) + "\t" + userId + "\t" + processedTweet + "\t" + str(nounsInTweet) + "\t" + str(hashTags) + "\n")

        i += 1

        if i == 2000:
            break

    fileTarget.close()
    processedTweetsFileTarget.close()

    return allNouns
Example #24
def extract(query):
    sentence = query
    tagged_sent = pos_tag(sentence.split())
    propernouns = [word for word,pos in tagged_sent if pos == 'NN']   
    return propernouns

#extract("I want to buy a car and a dog and plane")
def extract_pos(tokens, simple=True):
	"""
	Simple parts of speech are:
	VERB - verbs (all tenses and modes)
	NOUN - nouns (common and proper)
	PRON - pronouns
	ADJ - adjectives
	ADV - adverbs
	ADP - adpositions (prepositions and postpositions)
	CONJ - conjunctions
	DET - determiners
	NUM - cardinal numbers
	PRT - particles or other function words
	X - other: foreign words, typos, abbreviations
	. - punctuation
	:param tokens:
	:return:
	"""
	tokens_pos = pos_tag(tokens)
	pos = [p for t, p in tokens_pos]
	if simple:
		# translate larger set of part of speech tags into small, simpler set
		pos_dict = nltk.tagset_mapping('en-ptb', 'universal')
		pos = [pos_dict[p] for p in pos]
	return pos
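The same simplification can also be requested from pos_tag directly with tagset='universal' (this needs the universal_tagset resource); a small sketch that should match the mapping used above:

from nltk import pos_tag, word_tokenize

tokens = word_tokenize("The cat quickly chased two mice")
print([p for _, p in pos_tag(tokens, tagset='universal')])
# e.g. ['DET', 'NOUN', 'ADV', 'VERB', 'NUM', 'NOUN']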
def analiseSentimento(resposta):
	texto = resposta['corpo']
	frases = sentencesTokenizer.tokenize(texto)
	palavras = []
	for frase in frases:
		palavras.extend(wordsTokenizer.tokenize(frase))
	posTags = pos_tag(palavras)
	positivo = 0
	negativo = 0
	for palavra, tag in posTags:
		synsets = None
		if tag.startswith('J'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
		elif tag.startswith('V'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
		elif tag.startswith('N'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
		elif tag.startswith('R'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
		else:
			synsets = sentiwordnet.senti_synsets(palavra, '')
		if synsets != None:
			synsets = list(synsets)
			if len(synsets) > 0:
				synset = synsets[0]
				positivo = positivo + synset.pos_score()
				negativo = negativo + synset.neg_score()
	if positivo > negativo:
		return (resposta, 'positivo')
	elif negativo > positivo:
		return (resposta, 'negativo')
	else:
		return (resposta, 'neutro')
	def __init__(self, sentence):
		assert type(sentence) == str
		assert len(sentence) > 0
		self.raw_text = sentence.strip()
		self.lower = sentence.lower()
		self.normalised = sentence.replace('"', '``').strip()
		self.tokens, self.postags = zip(*pos_tag(word_tokenize(sentence)))
Example #28
def returnPOSTaggedWords(text):
    output={"CC":0,"CD":0,"DT":0,"EX":0,"FW":0,"IN":0,"JJ":0,"JJR":0,"JJS":0,"LS":0,"MD":0,"NN":0,"NNP":0,"NNPS":0,"NNS":0,"PDT":0,"POS":0,"PRP":0,"PRP$":0,"RB":0,"RBR":0,"RBS":0,"RP":0,"SYM":0,"TO":0,"UH":0,"VB":0,"VBD":0,"VBG":0,"VBN":0,"VBP":0,"VBZ":0,"WDT":0,"WP":0,"WP$":0,"WRB":0,"#":0,"$":0,"''":0,"(":0,")":0,",":0,".":0,":":0,"''":0,"-NONE-":0,"``":0}
    tokens=wordpunct_tokenize(text)
    tagged=pos_tag(tokens)

    for word,pos in tagged:
        output[pos]=output[pos]+1
    return output
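The zero-initialised dict above (which also repeats the "''" key) can be replaced with collections.Counter; a sketch of the same counting that never raises KeyError on unexpected tags:

from collections import Counter
from nltk import pos_tag, wordpunct_tokenize

def pos_tag_counts(text):
    # Count how often each Penn Treebank tag occurs in the text
    return Counter(tag for _, tag in pos_tag(wordpunct_tokenize(text)))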
def find_catalog(product_name,catalog_of_products):
	temp_catalog=list(catalog_of_products)
	tagged_text=pos_tag(product_name.split())
	output=nltk.ne_chunk(tagged_text)
	for subtree in output.subtrees(filter=lambda t: t.label() == 'PERSON'):
		for leave in subtree.leaves():
			temp_catalog.append(leave[0])
	return temp_catalog	
Example #30
 def get_mistake_nouns(self):
     mistakes_id = get_all_mistakes_id()
     for id in mistakes_id:
         phrase = get_mistake_noun(id)
         tagged_sent = pos_tag(phrase.split())
         nouns = [word for word, pos in tagged_sent if pos[0] == 'N']
         for noun in nouns:
             yield noun.strip('.')
Example #31
# In[288]:

ps = PorterStemmer()  #initialize Porter Stemmer object

ps_stems = []
for w in test_post_words:
    ps_stems.append(ps.stem(w))

print(' '.join(ps_stems))  # add all the stemmed words to one string

# In[293]:

#parts of speech tagging

token_tag = pos_tag(test_post_words)
token_tag[:10]

# In[294]:


def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
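get_wordnet_pos returns None for tags outside the J/V/N/R groups, so a caller usually falls back to NOUN; a sketch of lemmatizing the tagged tokens from above (the NOUN fallback is an assumption):

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()
lemmas = []
for word, tag in token_tag:
    wn_pos = get_wordnet_pos(tag) or wordnet.NOUN  # default when the tag is unmapped
    lemmas.append(lemmatizer.lemmatize(word, wn_pos))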
Example #32
from nltk import word_tokenize
from nltk.tag import pos_tag
V = ['VB', 'VBZ', 'VBP', 'VBD', 'VBG']
N = ['NN', 'NNS', 'NNP', 'NNPS']
ADV = ['RB', 'RBR', 'RBS']
ADJ = ['JJ', 'JJR', 'JJS']
# per-file counts: verbs, nouns, adverbs, adjectives, words
vLen, nLen, advLen, adjLen, wLen = ([] for i in range(5))
for fileid in newcorpus.fileids():
 tokens = word_tokenize(newcorpus.raw(fileid))
 words = [t for t in tokens if t.isalpha()]
 taggedW = pos_tag(words)
 verbs, nouns, advs, adjs = ([] for i in range(4))
 for (w,tag) in taggedW:
     if tag in V: verbs.append(w)
     elif tag in N: nouns.append(w)
     elif tag in ADV: advs.append(w)
     elif tag in ADJ: adjs.append(w)
 wLen.append(len(words))
 vLen.append(len(verbs))
 nLen.append(len(nouns))
 advLen.append(len(advs))
 adjLen.append(len(adjs))


plotData0 = [(wLen, vLen), (wLen, nLen), (wLen, adjLen)]
yaxisLabels = ['V x 1000', 'N x 1000', 'ADJ x 1000']
# Named Entity Recognition

# Importing libraries
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Importing the data
dataset = """Abraham Lincoln was an American statesman and lawyer 
              who served as the 16th President of the United States"""

# Tokenization and POS tagging
dataset_tag = pos_tag(word_tokenize(dataset))

# Apply NER
dataset_ner = ne_chunk(dataset_tag)
print(dataset_ner)

# Tree Diagram
dataset_ner.draw()
Example #34
# app = FlaskAPI(__name__)


f=open('about2.txt','r',errors = 'ignore')
raw=f.read()
# raw=raw.lower()
# nltk.download('punkt') # first-time use only
# nltk.download('wordnet') # first-time use only
# nltk.download('averaged_perceptron_tagger') #once

sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences 
word_tokens = nltk.word_tokenize(raw)# converts to list of words


propernouns = [word for word,pos in pos_tag(nltk.word_tokenize(raw)) if pos == 'NNP']
# print('propernouns')
# print(propernouns)
# print(word_tokens)

# new_word_tokens = []

# for w in word_tokens:
# 	if w in propernouns:
# 		new_word_tokens.append(w)
# 	else:
# 		new_word_tokens.append(w.lower())


# word_tokens = new_word_tokens
# print(word_tokens)
Example #35
def _build_matrix(chunk=15):

    # daily_updown = DailyStock.objects.all().order_by("diff_yesterday")
    # daily_updown = daily_updown.values('year', 'month', 'date', 'diff_yesterday')
    #
    # minval = abs(min(map(itemgetter('diff_yesterday'), daily_updown)))
    # maxval = abs(max(map(itemgetter('diff_yesterday'), daily_updown)))
    # if minval > maxval:
    #     daily_updown = [dict(d, diff_yesterday=(
    #         d['diff_yesterday']) / minval) for d in daily_updown]
    # else:
    #     daily_updown = [dict(d, diff_yesterday=(
    #         d['diff_yesterday']) / maxval) for d in daily_updown]
    documents = []
    up_days = list(DailyStock.objects.order_by('diff_yesterday')[:chunk])
    down_days = list(DailyStock.objects.order_by('-diff_yesterday')[:chunk])

    max_diff = max(up_days[0].diff_yesterday, abs(down_days[0].diff_yesterday))
    daily_updown = up_days + down_days

    for daily in daily_updown:
        daily.diff_yesterday = daily.diff_yesterday / max_diff

    text_dict = {}
    days_text_list = []
    for daily in daily_updown:
        # dateconversion
        print(daily)
        LINK = 'https://news.google.com/rss/search?q=samsung+electronics+when:{}-{:02d}-{:02d}&hl=en-US&gl=US&ceid=US:en'.format(
            daily.year,
            daily.month,
            daily.date,
        )
        print(LINK)
        xmldoc = minidom.parse(urllib.request.urlopen(LINK, timeout=10))
        itemlist = xmldoc.getElementsByTagName('item')
        today_text = ''
        # for newsitem in itemlist:
        doccount = 0
        for items in itemlist:
            if doccount > 10:
                break
            print(items)
            singlelink = items.getElementsByTagName(
                'link')[0].firstChild.nodeValue
            pubdate = items.getElementsByTagName(
                'pubDate')[0].firstChild.nodeValue
            date = date_parser.parse(pubdate).strftime("%Y%m%d")
            try:
                response = get(singlelink, timeout=10)
            except Exception as e:
                print('here')
                pass
            else:
                try:
                    extractor = Goose()
                    article = extractor.extract(raw_html=response.content)
                    text_str = article.cleaned_text
                    today_text += text_str
                    doccount += 1
                except TypeError:
                    print('this')
                    pass
        days_text_list.append(today_text)
        DailyDocument.objects.create(doc=today_text,
                                     is_up=daily.diff_yesterday > 0)

    scripts = days_text_list
    lemmatizer = WordNetLemmatizer()

    for sen in range(0, len(scripts)):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(scripts[sen]))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        document = document.split()
        doc = pos_tag(document)
        final_doc = []

        for i in range(len(doc)):
            if doc[i][1] in ['NN', 'NNP', 'NNS', 'NNPS']:
                document[i] = lemmatizer.lemmatize(document[i], 'n')
                final_doc.append(document[i])

            elif doc[i][1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                document[i] = lemmatizer.lemmatize(document[i], 'v')
                final_doc.append(document[i])

            elif doc[i][1] in ['JJ', 'JJR', 'JJS']:
                document[i] = lemmatizer.lemmatize(document[i], 'a')
                final_doc.append(document[i])

            elif doc[i][1] in ['RB', 'RBR', 'RBS', 'RP']:
                document[i] = lemmatizer.lemmatize(document[i], 'r')
                final_doc.append(document[i])

            #else:
            #    final_doc.append(document[i])

        pre_document = ' '.join(final_doc)
        documents.append(pre_document)

    vectorizer = TfidfVectorizer(stop_words='english',
                                 token_pattern=r'(?u)\b[A-Za-z]+\b',
                                 max_df=0.9)
    bag_of_words = vectorizer.fit_transform(documents)
    value_word = pd.DataFrame(bag_of_words.toarray()).mul(list(
        map(lambda d: d.diff_yesterday, daily_updown)),
                                                          axis=0).sum(axis=0)
    string_word = vectorizer.get_feature_names()
    word_dict = {
        string_word[i]: value_word[i]
        for i in range(len(string_word))
    }
    with open('word_dict_pickle', 'wb') as f:
        pickle.dump(word_dict, f)
    return word_dict
Example #36
# print(node_ids)

edge_id = 0
for s,t,l in js:
    try:
        gjson.setdefault('edges',[]).append({'id': edge_id,
                                         'from_id': node_ids[s],
                                         'label': l,
                                         'trg_id': node_ids[t],
                                         'relxn': 'inferred'})
    except Exception as e:
        print(s, t, l)
        print(str(e))
    edge_id += 1


# print(
#     json.dumps(gjson, indent=2, sort_keys=False)
# )
# https://stackoverflow.com/questions/17966554/in-python-nltk-i-am-trying-to-get-parts-of-speech-of-a-word-by-using-pos-tag-bu
#https://becominghuman.ai/natural-language-processing-in-python-3-using-nltk-fd0ff4a0da9b

for t in all_nodes:
    gjson.setdefault('nodes', []).append({'id': node_ids[t],
                                          'term': t,
                                          'pos':  pos_tag([t])[0][1] })

print(
    json.dumps(gjson, indent=2, sort_keys=False)
)
Example #37
da['targetDescription_subjectivity'] = data['targetDescription'].apply(lambda x : TextBlob(str(x)).sentiment.subjectivity)
da['targetParagraphs_polarity'] = data['targetParagraphs'].apply(lambda x : TextBlob(str(x)).sentiment.polarity)
da['targetParagraphs_subjectivity'] = data['targetParagraphs'].apply(lambda x : TextBlob(str(x)).sentiment.subjectivity)
da['targetTitle_polarity'] = data['targetTitle'].apply(lambda x : TextBlob(str(x)).sentiment.polarity)
da['targetTitle_subjectivity'] = data['targetTitle'].apply(lambda x : TextBlob(str(x)).sentiment.subjectivity)


# In[34]:


pos_di = {}
tagdict = load('help/tagsets/upenn_tagset.pickle')
for pos in list(tagdict.keys()):
	pos_di[pos] = []
for snt in data['postText']:
	di = Counter([j for i,j in pos_tag(word_tokenize(snt))])
	for pos in list(tagdict.keys()):
		pos_di[pos].append(di[pos])

da = pd.concat([da,pd.DataFrame(pos_di)], axis = 1)
#number of stop words
stp_wds = set(stopwords.words('english'))
da['postText_number_of_stop_words'] = data['postText'].apply(lambda x: len(stp_wds.intersection(word_tokenize(str(x)))))

#number of punctations
da['postText_num_of_unique_punctuations'] = data['postText'].apply(lambda x : len(set(x).intersection(set(string.punctuation))))


# In[35]:

Example #38
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import tree2conlltags
from pprint import pprint

# Read text file
text = open(
    "Dataset/train.txt")  # change the path of train.txt / valid.txt / test.txt
text = text.read()

# Convert text to word
word_token = word_tokenize(text)

# Perform POS tagging
word_pos = pos_tag(word_token)

# Define pattern for POS
pattern = 'NP: {<DT>?<JJ>*<NN>}'

# Chunk the POS-tagged words with the NP pattern
cp = nltk.RegexpParser(pattern)
cs = cp.parse(word_pos)

# IOB tagging of the resulting chunk tree
iob_tagged = tree2conlltags(cs)

print(iob_tagged)

NER_List = []
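The IOB triples can be turned back into a chunk tree with conlltags2tree, which is a handy sanity check that the chunk boundaries survived the round trip:

from nltk.chunk import conlltags2tree

# Rebuild a tree from the (word, pos, iob) triples; it should mirror the parse in cs
print(conlltags2tree(iob_tagged))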
Example #39
def adapted_lesk(word, sentence, context_window_size=3, pos=None):
    """Performs word sense disambiguation using the Adapted Lesk Algorithm,
    due to Banerjee and Pedersen.

    Arguments:
        *word* (str) -- the target word to be disambiguated \n
        *sentence* (str) -- the context in which the target word occurs \n
        *context_window_size* (int) -- the number of words from the left and
        right of the target word to be taken into analysis \n
        *pos* (str) -- the part of speech of the target word

    Returns:
        Synset type -- the WordNet sense of the disambiguated word
    """

    # Tokenize input sentence, remove punctuation and stopwords
    sentence = utils.remove_stopwords(utils.remove_punctuation(w_tok(sentence)))

    # Perform lemmatization on sentence
    lemmatizer = WordNetLemmatizer()
    tagged_sentence = pos_tag(sentence)
    
    sentence = [lemmatizer.lemmatize(tup[0], utils.get_wordnet_pos(tup[1])) 
                for tup in tagged_sentence]
    
    # Perform lemmatization on target word
    if pos == None:
        tagged_word = pos_tag([word])
        word = lemmatizer.lemmatize(tagged_word[0][0], 
                                    utils.get_wordnet_pos(tagged_word[0][1]))
        pos = utils.get_wordnet_pos(tagged_word[0][1])
    else:
        word = lemmatizer.lemmatize(word, pos)
    
    # Extract the context window from the sentence
    if word in sentence:
        word_index = sentence.index(word)
        if word_index - context_window_size < 0:
            window_words = sentence[0 : word_index + context_window_size + 1]
        else:
            window_words = sentence[word_index - context_window_size : 
                                    word_index + context_window_size + 1]
    
        # Take the Synsets of the target word
        senses = wn.synsets(word)
        best_sense = senses[0]
        best_score = 0
        
        for sense in senses:
            if sense.pos() == pos:
                # Only take the current sense into account if it is the correct pos
                score = 0
        
                for w in window_words:
                    if w != word:
                        w_senses = wn.synsets(w)
        
                        for w_sense in w_senses:
                            score += similarity(sense, w_sense, pos)
        
                if score > best_score:
                    best_score = score
                    best_sense = sense
    else:  # If target word is not in context, after lemmatizing, return first wordnet sense
        f = open('logs/guessed.txt', 'a')
        line = "word: " + word + " in sentence: " + ' '.join(sentence)
        f.write(line + '\n')
        f.close()
        return wn.synsets(word)[0]
    
    return best_sense
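A hedged usage example ('bank' is only an illustration; the result depends on the similarity() measure and the utils helpers referenced above):

sense = adapted_lesk('bank', 'I deposited my paycheck at the bank on Friday', pos='n')
print(sense, sense.definition())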
Example #40
"""
Created on Tue Apr 16 10:33:57 2019

@author: [email protected]
Information extraction
https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
"""

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

sent = word_tokenize(ex)
sent = pos_tag(sent)

#we get tuples of words with their respective pos tags. But the funny thing with pos tags is that
#they can change for the same word depending on where it occurs in a sentence.

#Chunking to extract ners

"Lets create a chunk NP for extracting noun phrases"

pattern = "NP:{<DT>?<JJ>*<NN>}"

#lets test it

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)
Example #41
def extract_NNPs(text):
    tagged = pos_tag(text.split())
    NNPs = [word for word, pos in tagged if pos == 'NNP']

    return NNPs
Example #42
def factSequenceAnalysis(path):
    tree = ET.parse(path)
    root = tree.getroot()
    yRange = 0
    #fil = open("output.txt",'w')
    result = []
    f = 0
    old_rev = []
    count_rev = 0
    total_facts = []
    #fil = open("output.txt","w");
    for rev in root.find(
            '{http://www.mediawiki.org/xml/export-0.10/}page').findall(
                '{http://www.mediawiki.org/xml/export-0.10/}revision'):
        count_rev += 1
        '''
         if(count_rev==3):
             break
         '''
        text = rev.find('{http://www.mediawiki.org/xml/export-0.10/}text').text
        if (not text):
            total_facts.append(0)
            continue
        tags = ["NNP", "NNPS"]

        tagged_sent = pos_tag(word_tokenize(text))
        #fil.write("TimeStamp:"+rev.find("{http://www.mediawiki.org/xml/export-0.10/}timestamp").text+"\n");
        current_rev = []
        #fil.write("TimeStamp:"+rev.find("{http://www.mediawiki.org/xml/export-0.10/}timestamp").text+"\n");
        if (f == 0):
            yRev = []
            count_y = 1
            for tagged_word in tagged_sent:
                if hasNumbers(tagged_word[0]) == False and hasPunctuations(
                        tagged_word[0]) == False and len(
                            tagged_word[0]
                        ) > 1:  #to remove words like ",","132" etc.
                    if (tagged_word[1] in tags):
                        #fil.write(tagged_word[0]+" , ")
                        old_rev.append(str(tagged_word[0]))
                        yRev.append(count_y)
                        count_y += 1
            #fil.write("\n=====================================================================\n")
            #print(old_rev)
            total_facts.append(len(old_rev))
            if (len(old_rev) > yRange):
                yRange = len(old_rev)
            result.append(yRev)
            f = 1
        else:

            for tagged_word in tagged_sent:
                if hasNumbers(tagged_word[0]) == False and hasPunctuations(
                        tagged_word[0]) == False and len(
                            tagged_word[0]
                        ) > 1:  #to remove words like ",","132" etc.
                    if (tagged_word[1] in tags):
                        #fil.write(tagged_word[0]+" , ")
                        current_rev.append(str(tagged_word[0]))

            new_list = find_diff(old_rev, current_rev)
            total_facts.append(len(current_rev))
            #print(current_rev)
            yRev = new_list[0]
            if (len(current_rev) > yRange):
                yRange = len(current_rev)
            result.append(yRev)
            old_rev = current_rev
        #print("one revision completed!!")
        #fil.write("\n=====================================================================\n")

    result.append(total_facts)
    result.append(count_rev)
    return result
Example #43
    review += arr.strip()  #adds it to var review
    arr = fileReview.read()

fileReview.close()

print "Sentence tokenization..."
review_dict = sent_tokenize(review)  #tokenizes sentences
arr_pos = []
removed = []

print "POS tagging for words..."
#arr_sent = pos_tag(review_dict)		#tagging words for semantic
#annotation

for sent in review_dict:  #adding individual sentences after tagging
    arr_pos.append(pos_tag(sent.split()))

################################################################################
################################################################################

print "Loading Parser..."
#t = npc.parse(tmp_arr_pos[0])
print "Finished loading..."

#print len(t)
#t.draw()
#help(t)
sentCount = 1
sentScore = []  #tuple with (Subj-Obj , Verb-P , )
totalS = []
Example #44
swlist = stopwords.words('english')
stemmer = PorterStemmer()

pos_corpus = []
neg_corpus = []
for f in pos:

    # match any special character and remove it (other than _)
    processed_content = re.sub(r'\W+', ' ', f.lower())

    # text into tokens
    words = word_tokenize(processed_content)

    # Attaching part of speech to each word
    pos_words = pos_tag(words)  #returns a list

    clean_words = []
    for w in pos_words:
        if w[0] in swlist or len(w[0]) <= 3 or w[1] not in (
                'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'ADJ', 'ADV', 'VBN', 'VBG'):
            continue
        clean_words.append(stemmer.stem(w[0]))

    pos_content = ' '.join(clean_words)
    pos_corpus.append(pos_content)

for f in neg:
    processed_content = re.sub(r'\W+', ' ', f.lower())
    words = word_tokenize(processed_content)
    neg_words = pos_tag(words)
Example #45
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

sentence = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))

print(ne_tree)

ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'


def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


sent = preprocess(ex)
sent

pattern = 'NP: {<DT>?<JJ>*<NN>}'

import shanepy
import shanepy as spy
from shanepy import *

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
Example #46
#Stemming
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
print("\n\n)")
print("STEMMING")
for WORD in WORDS:
    stemmer = PorterStemmer()
    print(WORD, stemmer.stem(WORD))

#POS
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
print("\n\n")
print("POS-TAG")
for WORD in WORDS:
    print(WORD, pos_tag([WORD]))  # pos_tag expects a list of tokens

#LEMMATIZATION
from nltk.stem import WordNetLemmatizer
print("\n\n")
print("LEMMATIZATION")
lemmatizer = WordNetLemmatizer()
for WORD in WORDS:
    print(WORD, lemmatizer.lemmatize(WORD))
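Without a POS argument, lemmatize() treats every word as a noun, so verb forms are left unchanged; a sketch that feeds the tag in (the J/V/N/R-to-WordNet mapping mirrors the pattern used in the other examples here):

from nltk import pos_tag
from nltk.corpus import wordnet

tag_map = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}
for WORD, TAG in pos_tag(WORDS):
    wn_pos = tag_map.get(TAG[0], wordnet.NOUN)  # fall back to noun for unmapped tags
    print(WORD, lemmatizer.lemmatize(WORD, wn_pos))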

#TRIGRAM
from nltk import word_tokenize
from nltk.util import ngrams
print("\n\n")
print("TRIGRAM")
input_list = []