def get_concordance_polarity(main_df, uni_ents, lex_path):
    """
		Preparing list of entities for finding concordances and their polarity
        :param main_df: the article dataFrame
        :param uni_ents: entites id from articles
        :param lex_path: location of lexicon files
        :return: dataframe with 2 columns ent_id and conc_pol
	"""
    text = nltk.Text(main_df.lemma.tolist())
    c = nltk.ConcordanceIndex(text, key=lambda s: s.lower())
    entities = []
    for ent in uni_ents:
        #extract the words that correspond to the entity id
        word = list(set(main_df.loc[main_df['ent_id'] == ent, 'lemma']))
        temp = []
        for w in word:
            #only continue with words that are PROPN, ADJ or NOUN without duplicating
            if (main_df.loc[main_df['lemma'] == w, 'POS_tag'].head(1).item()
                    in ('PROPN', 'ADJ', 'NOUN')):
                if w.lower() not in temp:
                    temp.append(w.lower())
        entities.append(temp)
    dict_entities = dict(zip(uni_ents, entities))

    polarity_conc = pd.DataFrame(uni_ents, columns=['ent_id'])
    polarities = [
        polarity_concordances(ent_v, c, text, lex_path)
        for ent_k, ent_v in dict_entities.items()
    ]
    polarity_conc['conc_pol'] = pd.Series(polarities)

    return polarity_conc
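# A minimal usage sketch for get_concordance_polarity above. It assumes pandas
# and nltk are installed; polarity_concordances is a hypothetical stub standing
# in for the project's real lexicon-based scorer, which is not shown here.
import nltk
import pandas as pd

def polarity_concordances(ent_words, conc_index, text, lex_path):
    # hypothetical stand-in: the real helper would score the concordance lines
    # of ent_words against the lexicon files under lex_path
    return 0.0

toy_df = pd.DataFrame({
    'lemma':   ['Paris', 'be', 'a', 'beautiful', 'city'],
    'POS_tag': ['PROPN', 'AUX', 'DET', 'ADJ', 'NOUN'],
    'ent_id':  [1, None, None, None, None],
})
# expected: a one-row DataFrame with columns ent_id and conc_pol
print(get_concordance_polarity(toy_df, [1], lex_path='lexicons/'))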
    def getContext(self,phrase):
        phrase = phrase.lower()
        first_word = phrase.split(" ")[0]

        context = nltk.ConcordanceIndex(self.tokens)
        excerpt_padding = 6
        excerpts = []

        for i in context.offsets(first_word):
            start = max(0, i - excerpt_padding)
            end = min(len(self.tokens), i + excerpt_padding)
            excerpt = " ".join(self.tokens[start:end])

            if phrase in excerpt:
                # trim to ~30 characters of context on each side of the phrase,
                # clamping the start so a negative index does not wrap around
                i_phrase = excerpt.index(phrase)
                start_char = max(0, i_phrase - 30)
                excerpt = excerpt[start_char:i_phrase + 30 + len(phrase)]
                if len(excerpt) < 10:
                    continue
                excerpts.append(excerpt)
        return excerpts
Example #3
    def n_concordance_tokenised(self,
                                text,
                                phrase,
                                left_margin=5,
                                right_margin=5):

        phraseList = phrase.split(' ')

        c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

        #Find the offset for each token in the phrase
        offsets = [c.offsets(x) for x in phraseList]
        offsets_norm = []
        #For each token in the phraselist, find the offsets and rebase them to the start of the phrase
        for i in range(len(phraseList)):
            offsets_norm.append([x - i for x in offsets[i]])
        intersects = set(offsets_norm[0]).intersection(*offsets_norm[1:])

        concordance_txt = [
            text.tokens[max(offset - left_margin, 0):offset +
                        len(phraseList) + right_margin]
            for offset in intersects
        ]

        outputs = [
            ''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt
        ]
        return outputs
Example #4
def n_concordance_tokenised(text, phrase, left_margin=5, right_margin=5):
    #concordance replication via https://simplypython.wordpress.com/2014/03/14/saving-output-of-nltk-text-concordance/

    phraseList = phrase.split(' ')

    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    #Find the offset for each token in the phrase
    offsets = [c.offsets(x) for x in phraseList]
    offsets_norm = []
    #For each token in the phraselist, find the offsets and rebase them to the start of the phrase
    for i in range(len(phraseList)):
        offsets_norm.append([x - i for x in offsets[i]])
    #We have found the offset of a phrase if the rebased values intersect;
    #the intersection method takes an arbitrary number of arguments
    # http://stackoverflow.com/a/3852792/454773
    intersects = set(offsets_norm[0]).intersection(*offsets_norm[1:])

    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset +
                    len(phraseList) + right_margin] for offset in intersects
    ]

    outputs = [
        ''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt
    ]
    return outputs
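# A short usage sketch for n_concordance_tokenised above (assumes nltk and its
# 'punkt' tokenizer data are available; the sample sentence is only illustrative).
import nltk

sample = ("The quick brown fox jumps over the lazy dog. "
          "A quick brown fox is rarely seen in town.")
sample_text = nltk.Text(nltk.word_tokenize(sample))
# prints a few tokens of context around each occurrence of the phrase
for line in n_concordance_tokenised(sample_text, 'quick brown',
                                    left_margin=3, right_margin=3):
    print(line)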
def get_all_phases_containing_tar_wrd(target_word,
                                      tar_passage,
                                      left_margin=10,
                                      right_margin=10):
    """
        Function to get all the phases that contain the target word in a text/passage tar_passage.
        Workaround to save the output given by nltk Concordance function

        str target_word, str tar_passage int left_margin int right_margin --> list of str
        left_margin and right_margin allocate the number of words/pununciation before and after target word
        Left margin will take note of the beginning of the text
    """

    ## Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)
    print("Tokenns------------->", tokens)

    ## Create the text of tokens
    text = nltk.Text(tokens)
    print("Text----", text)
    ## Collect all the index or offset position of the target word
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    ## Collect the tokens around each occurrence using text.tokens[start:end],
    ## clamping the start at zero when offset - left_margin would be negative
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]

    ## join the sentences for each of the target phrase and return it
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
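# A minimal usage sketch for get_all_phases_containing_tar_wrd (assumes nltk and
# its 'punkt' tokenizer data; the passage is only illustrative).
passage = ("Concordance lines show a target word in context. "
           "Each concordance line keeps a fixed number of tokens on either side.")
for phrase in get_all_phases_containing_tar_wrd('concordance', passage,
                                                left_margin=4, right_margin=4):
    print(phrase)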
def get_concordance(nltk_text, word, left_margin = 10, right_margin = 10):
    index = nltk.ConcordanceIndex(nltk_text.tokens, key = lambda s: s.lower())
    concordance_txt = [nltk_text.tokens[max(offset - left_margin, 0):offset + right_margin]
                       for offset in index.offsets(word)]
    output = [''.join([x+' ' for x in con_sub]) for con_sub in concordance_txt]
    outputFormatted = str(output).replace(",", ";")  # make sure the concordances aren't broken up by ',' in the csv
    return outputFormatted
Example #7
def context(target_word, tar_passage, left_margin = 10, right_margin = 10):
    tokens = tokenize.word_tokenize(tar_passage)
    text = nltk.Text(tokens)
    c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())
    concordance_txt = [text.tokens[max(offset - left_margin, 0):offset + right_margin]
                       for offset in c.offsets(target_word)]
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
Example #8
    def preprocess(self, text=None, stem=False, fix_pdf=True):

        if text is None:
            text = self.text

        def fix_pdf2txt(texto):
            import re
            texto = re.sub(r'\n([^A-Z])', r' \1', texto)
            texto = re.sub(r'([^\.])\n', r'\1.\n', texto)
            return texto

        def tokenizer_fr(text):
            # Courtesy of http://www.fabienpoulard.info/post/2008/03/05/Tokenisation-en-mots-avec-NLTK

            return tok_fr.tokenize(text)

        # Fix newline problems with pdf to txt step
        if fix_pdf:
            text = fix_pdf2txt(text)

        text = text.lower()

        # Tokenization
        self._original_tokens = tokenizer_fr(text)
        self._tokens = self._original_tokens

        #         self._tokens = [t for t in self._tokens if len(t) > 1]

        if stem:
            from nltk.stem.snowball import FrenchStemmer
            fr_stemmer = FrenchStemmer()
            self._tokens = [fr_stemmer.stem(t) for t in self._tokens]

        self._concordance_index = nltk.ConcordanceIndex(self._tokens,
                                                        key=lambda s: s)
Example #9
def get_concordance(word, textlist):
    """
    Print out the concordance of a word in a list of texts
    """
    for text in textlist:
        tokens = nltk.word_tokenize(text)
        ci = nltk.ConcordanceIndex(tokens)
        if ci.offsets(word):
            ci.print_concordance(word)
def kwic(target_word, passage, left_margin=5, right_margin=5):
    tokens = nltk.word_tokenize(passage)
    text = nltk.Text(tokens)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
def get_concordance(word, textlist):
    """
    Print out the concordance of a word in a list of texts
    """
    for text in textlist:
        
        ph, tokens = get_tokens(text)
        phrases = get_phrases(ph)
        ci = nltk.ConcordanceIndex(phrases)
        if ci.offsets(word):
            ci.print_concordance(word)
    def __init__(self, text):
        self.corpus = text.lower()
        self.pos_tags = pos_tag(text, True)
        self.word_count = len(self.pos_tags)
        self.c_values = []  # form [(c-value, ngram)]
        self.nc_values = []  # form: [(ngram, nc-value)]
        self.candidate_cache = []
        self.context_words = defaultdict(lambda: [0, 0])
        self.conc_index = nltk.ConcordanceIndex(self.pos_tags)

        # maps from ("token", "pos-tag") to
        # (freq. as context word, no. of ngrams it appears with):
        self.weights = defaultdict(int)
Example #13
def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin = 10, right_margin = 10):
    
    tokens = nltk.word_tokenize(tar_passage)
     
    text = nltk.Text(tokens)
 
    c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())
 
   
    concordance_txt = [text.tokens[max(offset - left_margin, 0):offset + right_margin]
                       for offset in c.offsets(target_word)]
    
    return [''.join([x+' ' for x in con_sub]) for con_sub in concordance_txt]
Example #14
def findConcordanceText(target_word, raw, left_margin=10, right_margin=10):
    raw = re.sub(r'\W+', ' ', raw)
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    finalText = ""
    for offset in c.offsets(target_word):
        l = max(offset - left_margin, 0)
        r = offset + right_margin
        line = ' '.join(tokens[l:r])
        finalText = finalText + ' ' + line
    return finalText
Example #15
def counterSearch(rObject):
	rType=0
	cType=0
	for o in rObject['nodes']:
		nid = o['nodeID']
		n = int(nid)
		ntype = o["type"]
		text = nltk.ConcordanceIndex(nltk.word_tokenize(o['text']))
		for w in RA:
			ra = w.lower()
			if text.offsets(ra):
				print("\n")
				text.print_concordance(ra,0,0)
				fromID.append(n)
				rType=1
				print('\nMatch on nodeID:',n,"type:",ntype,"with word:",ra)
				for e in rObject['edges']:  # check the node connections from the finished product
					eid = e['edgeID']
					etid = e['toID']
					efid = e['fromID']
					if (etid == n or efid == n) and ntype == "RA":
						print("\n !! RA confirmed on edge:", eid, efid, etid, ntype)
					es = int(eid)
		for x in CA:
			ca = x.lower()
			if text.offsets(ca):
				text.print_concordance(ca)
				print("CA")
				print(ca)
				cType = 1  # record the CA match so the CA node is created below
				for e in rObject['edges']:  # check the node connections from the finished product
					eid = e['edgeID']
					etid = e['toID']
					efid = e['fromID']
					if (nid == etid or nid == efid) and ntype == "CA":
						print("\nCA confirmed on edge:", eid)
	ns=int(nid)
	if rType == 1:
		nodeType="RA"
		topNode=nodeCount(nid,ns)	
		topEdge=edgeCount(es,eid)
		targetNode = createNode(topNode,nodeType)
		for x in fromID:
			topEdge=createEdge(topEdge,x,targetNode)
	if cType == 1:
		nodeType="CA"
		topNode=nodeCount(n,e)	
		createNode(topNode,nodeType)
	return rObject
Example #16
def word_phases(target_word, query_text, left_margin = 10, right_margin = 10):
    """
        Function to get all the phases that contain the target word in a text/passage tar_passage.
         
        str target_word, str tar_passage int left_margin int right_margin --> list of str
        left_margin and right_margin allocate the number of words/pununciation before and after target word
        Left margin will take note of the beginning of the text
    """
    ## Collect all the index or offset position of the target word
    c = nltk.ConcordanceIndex(query_text.tokens, key = lambda s: s.lower())
 
    ## Collect the range of the words that is within the target word by using text.tokens[start;end].
    ## The map function is use so that when the offset position - the target range < 0, it will be default to zero
    concordance_txt = ([query_text.tokens[list(map(lambda x: x-5 if (x-left_margin)>0 else 0,[offset]))[0]:offset+right_margin] for offset in c.offsets(target_word)])
                         
    ## join the sentences for each of the target phrase and return it
    return [''.join([x+' ' for x in con_sub]) for con_sub in concordance_txt]
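# Usage sketch for word_phases: unlike the variants above, the caller passes an
# already-tokenised nltk.Text rather than a raw string (assumes nltk and its
# 'punkt' tokenizer data).
import nltk

query_text = nltk.Text(nltk.word_tokenize(
    "The committee approved the budget. The budget vote was unanimous."))
print(word_phases('budget', query_text, left_margin=3, right_margin=3))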
def get_all_phrases_containing_tar_wrd(target_word,
                                       tar_passage,
                                       left_margin=10,
                                       right_margin=10):
    ## Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)

    ## Create the text of tokens
    text = nltk.Text(tokens)

    ## Collect all the index or offset position of the target word
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    ## Collect the tokens around each occurrence using text.tokens[start:end],
    ## clamping the start at zero when offset - left_margin would be negative
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]

    ## join the sentences for each of the target phrase and return it
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
Example #18
def get_all_phrases(target_word, tar_passage, left_margin=10, right_margin=10):
    """
        Get all the phrases that contain the target word in a text tar_passage.
        Workaround to save the output given by nltk Concordance function.
        left_margin and right_margin allocate the number of words/punctuation before and after the target word.

        :param target_word: str
        :param tar_passage: str
        :param left_margin: int
        :param right_margin: int
        :return: list
        """

    # Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)

    # Create the text of tokens
    text = nltk.Text(tokens)

    # Collect all the index or offset position of the target word
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    # Collect the tokens around each occurrence using text.tokens[start:end].
    # The start is clamped at zero; when it is clamped, the end is pushed right
    # so every excerpt keeps left_margin + right_margin tokens.
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):
                    offset + right_margin + max(left_margin - offset, 0)]
        for offset in c.offsets(target_word)
    ]

    # join the sentences for each of the target phrase and return it
    return [
        ' '.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt
    ]
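# Usage sketch for get_all_phrases, showing that when the match sits near the
# start of the passage the window is extended on the right so each excerpt keeps
# roughly left_margin + right_margin tokens (assumes nltk and its 'punkt' data).
passage = "Tokens near the start of a passage still get a full-width excerpt."
print(get_all_phrases('tokens', passage, left_margin=5, right_margin=5))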
Example #19
def n_concordance_tokenised(text, phrase, left_margin=1, right_margin=1):
    phraseList = phrase.split(' ')
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    offsets = [c.offsets(x) for x in phraseList]
    offsets_norm = []
    for i in range(len(phraseList)):
        offsets_norm.append([x - i for x in offsets[i]])
    intersects = set(offsets_norm[0]).intersection(*offsets_norm[1:])
    for offset in intersects:
        concordance_txt_left = []
        concordance_txt_right = []
        concordance_txt_middle = []
        start_offset = offset - left_margin if (offset -
                                                left_margin) > 0 else 0
        end_offset = offset + len(phraseList) + right_margin
        for x in range(start_offset, offset):
            concordance_txt_left += [text.tokens[x]]
        for x in range(offset + len(phraseList), end_offset):
            concordance_txt_right += [text.tokens[x]]
        for x in range(offset, offset + len(phraseList)):
            concordance_txt_middle += [text.tokens[x]]
        yield ' '.join(concordance_txt_left), ' '.join(
            concordance_txt_middle), ' '.join(concordance_txt_right)
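# Usage sketch for the generator variant above: it yields (left, middle, right)
# string triples instead of a single joined line (assumes nltk and 'punkt' data).
import nltk

txt = nltk.Text(nltk.word_tokenize(
    "I saw a red car and then another red car parked outside."))
for left, middle, right in n_concordance_tokenised(txt, 'red car',
                                                   left_margin=2, right_margin=2):
    print(left, '[', middle, ']', right)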
Example #20
script, inputfilename, expression = argv

raw = open(inputfilename).read()

tokens = nltk.wordpunct_tokenize(raw)

text = nltk.Text(tokens)

text.concordance(expression, width=40, lines=100)

text.findall('<%s><to><.*><.*>' % expression)

words = [w.lower() for w in text]

c = nltk.ConcordanceIndex(text.tokens)

unableset = [
    text.tokens[offset + 2] for offset in c.offsets(expression)
]

print(len(unableset))

words = [w.lower() for w in unableset]

vocab = sorted(set(words))

print(len(vocab))

fdist = nltk.FreqDist(unableset)
Example #21
f = open("crash.txt")
#f = open("crash.txt", 'r+', encoding="utf-8")

#f = open("crashchap1")
crash = f.read()  #+ff.read()
tokens = nltk.word_tokenize(crash)
text1 = nltk.Text(tokens)
#lines=text.concordance("injuries", 140, 1000)

# list of terms to re-permute
ll = {}

for rnge in range(21):
    ll[rnge] = []

c = nltk.ConcordanceIndex(text1.tokens, key=lambda s: s.lower())
for offset in c.offsets('crash'):
    for rnge in range(21):
        ll[rnge] = ll[rnge] + [text1.tokens[offset + (rnge - 10)]]

#print ll


# permute/randomise our x terms
def a(input):
    x = input[random.randint(0, len(input) - 1)]
    return x


sentences = ""
for x in range(100):
Example #22
def textRank(tokenized_words, tag_word_dict):
    
    words_set = syntactic_filter(tag_word_dict)

    # Add the vertex to the graph
    graph_dict = {}
    for w in words_set:
        graph_dict[w] = []
    
    graph = Graph(graph_dict)

    # Add the edges
    words = nltk.Text (tokenized_words)
    doc = nltk.ConcordanceIndex(words)
    for w in words_set:
        results = get_concordance(w, doc)

        for context in results:
            left = context[0].split()
            right = context[1].split()

            for l in left:
                if l in words_set:
                    graph.add_edge((w, l))
                    graph.add_edge((l, w))

            for l in right:
                if l in words_set:
                    graph.add_edge((w, l))
                    graph.add_edge((l, w))

    # Run the text rank algorithm
    delta = 1
    i = 0
    d = 0.85
    while (delta > 0.0001 and i < 5000):
        
        for v in graph.vertices():
            degree = graph.vertex_degree(v)
            old_rank = graph.text_rank(v)

            sum = 0
            for v2 in graph.adjacency_list(v):
                degree2 = graph.vertex_degree(v2)
                # print ("Degree for " + v2 + " = " + str(degree2))

                tr = graph.text_rank(v2)

                sum += tr / degree2

            value = (1 - d) + d * sum
            graph.set_text_rank(v, value)

            if abs(value - old_rank) < delta:
                delta = abs(value - old_rank)
        i = i + 1

    text_rank_dict = {}
    for v in graph.vertices():
        text_rank_dict[v] = graph.text_rank(v)

    sorted_text_rank = sorted(text_rank_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_text_rank
Example #23
    def get_words_from_proximity(
            self, keyword_list,
            text):  #think of how to get nouns from sentence only... !!
        #create object of doc_frequency

        doc_freq_obj = doc_freq_class.context()

        tokens = nltk.word_tokenize(text)
        #print "tokens:"
        #print tokens
        # drop non-alphanumeric tokens (removing items while iterating over the
        # same list skips elements, so build a new list instead)
        tokens = [t for t in tokens if t.isalnum()]

        c = nltk.ConcordanceIndex(tokens, key=lambda s: s.lower())

        tokens_pos = nltk.pos_tag(tokens)
        i = 5
        doc_freq = []
        df_cnt = 0

        print "keywords going to loop",
        print keyword_list

        keywords = []
        for k in keyword_list:
            kw = nltk.word_tokenize(k)
            keywords.append(kw)

        print "keywords"
        print keywords

        for kw in keywords:
            print "keyword::::::::",
            print kw
            #split keyword not required as kw is list of strings
            #k = nltk.word_tokenize(kw)
            #print k

            #print "keywords in for ",
            #print kw
            first_word = kw[0]  #1st word in keyword
            #print "first word"
            #print first_word
            keyword_len = len(kw)
            #print "LEN="+str(keyword_len)
            i = 5
            nomatch = 0
            #print "IN KWD LOOP."
            print "offset",
            print c.offsets(first_word)
            doc_freq.append(document_frequency(kw))
            no_of_times = 0
            for offset in c.offsets(first_word):
                print kw
                j = 1
                i = 5
                #print "Keyword=",
                #print kw,
                #print " OFFSET=" + str(offset)
                nomatch = 0
                while j < keyword_len:
                    #print "in while"
                    #print tokens[offset+j]
                    #print kw[j]

                    if tokens[offset + j].lower() != kw[j].lower():
                        #print tokens[offset+j]
                        #print k[j]
                        nomatch = 1
                        break
                    j = j + 1
                if nomatch == 0:

                    #print "matched kwd",
                    #print tokens[offset:offset+j-1]
                    #print tokens[offset-5:offset+5]
                    i = 5
                    while i > 0:
                        if (offset - i) < 0:
                            break

                        if (tokens_pos[offset - i][1] in ["NN", "NNP"]) and (
                                tokens_pos[offset - i][0].lower()
                                not in nltk.corpus.stopwords.words('english')):
                            # the stopword check looks at the word itself ([0]),
                            # not the POS tag ([1])
                            doc_freq[df_cnt].addneighbour(tokens_pos[offset - i][0])

                            print tokens_pos[offset - i][0],

                            #pass
                        i = i - 1

                    print "\m/ ",
                    print kw,
                    print "\m/ ",
                    i = 1

                    while i < 5:
                        if (offset + i + (keyword_len - 1)) >= len(tokens):
                            break

                        if (tokens_pos[offset + i + (keyword_len - 1)][1] in
                                ["NN", "NNP"]) and (
                                    tokens_pos[offset + i + (keyword_len - 1)][0].lower()
                                    not in nltk.corpus.stopwords.words('english')):
                            doc_freq[df_cnt].addneighbour(
                                tokens[offset + i + (keyword_len - 1)])
                            print tokens_pos[offset + i +
                                             (keyword_len - 1)][0],

                        i = i + 1
                    k = 0
                    print "\n\n"
                    while k < doc_freq[df_cnt].cnt:
                        #doc_freq[df_cnt].neighbours[k].freq_word = fd1[context_vectors[CV_cnt].keyword]
                        doc_freq[df_cnt].neighbours[k].find_doc_freq(
                            doc_freq[df_cnt].keyword)
                        k = k + 1

                    doc_freq[df_cnt].neighbours.sort(
                        key=lambda x: x.freq_together, reverse=True)
                    if doc_freq[df_cnt].cnt > 5:
                        # keep the 5 neighbours with the highest weight
                        doc_freq[df_cnt].neighbours = doc_freq[df_cnt].neighbours[:5]
                        doc_freq[df_cnt].cnt = 5
                    k = 0
                    #while k < doc_freq[df_cnt].cnt:
                    print "keyword: ",
                    for l in doc_freq[df_cnt].keyword:
                        print l,
                    print "\n"
                    print "neighbours: ",
                    for m in doc_freq[df_cnt].neighbours:
                        print m.word,
                        print "\n"
                        #k += 1
                no_of_times = no_of_times + 1
                if no_of_times >= 2:
                    break

            #import pdb;pdb.set_trace();
            df_cnt = df_cnt + 1
        #results = search_web(doc_freq)
        print doc_freq
        return doc_freq
def main(argv, matches=2):

    fName = 'bbc/politics/' + str(argv)
    f = open(fName, 'r')
    raw_text = f.read()

    # Tokenize the tokenized_words of the text
    tokenized_words = nltk.word_tokenize(raw_text)

    # Making the tokenized_words to lower case
    for i in range(len(tokenized_words)):
        tokenized_words[i] = tokenized_words[i].lower()

    # POS tag the words
    tagged_words = nltk.pos_tag(tokenized_words)

    # Extracting the tags of the text
    tags = set([tag for (word, tag) in tagged_words])
    word_tag_dict = {}
    tag_word_dict = {}

    for (word, tag) in tagged_words:
        if word in word_tag_dict.keys():
            word_tag_dict[word.lower()].append(tag)
        else:
            word_tag_dict[word.lower()] = [tag]

        if tag in tag_word_dict.keys():
            tag_word_dict[tag].append(word)
        else:
            tag_word_dict[tag] = [word]

    words = nltk.Text(tokenized_words)
    doc = nltk.ConcordanceIndex(words)

    stemmer = PorterStemmer()

    # # Call text Rank
    # sorted_text_rank = textRank(tokenized_words, tag_word_dict)
    # set1 = set([w.lower() for (w, val) in sorted_text_rank[:15]])
    # removeList = []
    # for w in set1:
    #     if stemmer.stem(w) != w and stemmer.stem(w) in set1:
    #         removeList.append(w)

    # for w in removeList:
    #     set1.remove(w)

    # sorted_text_rank = [(w, val) for (w, val) in sorted_text_rank[:15] if w not in removeList]

    # offset_dict_text_rank = {}
    # for words1 in set1:
    #     offset_dict_text_rank[words1] = doc.offsets(words1)

    # Call tf
    sorted_tfValues = tf(tokenized_words, word_tag_dict)
    set2 = set([w.lower() for (w, val) in sorted_tfValues[:15]])
    removeList = []
    for w in set2:
        if stemmer.stem(w) != w and stemmer.stem(w) in set2:
            removeList.append(w)

    for w in removeList:
        set2.remove(w)

    sorted_tfValues = [(w, val) for (w, val) in sorted_tfValues[:15]
                       if w not in removeList]

    offset_dict_tf = {}
    for words2 in set2:
        offset_dict_tf[words2] = doc.offsets(words2)

    # # Call tf-idf
    # sorted_tf_idf = tfIdf (raw_text, word_tag_dict)
    # set3 = set([w for (w, val) in sorted_tf_idf[:15]])
    # removeList = []
    # for w in set3:
    #     if stemmer.stem(w) != w and stemmer.stem(w) in set3:
    #         removeList.append(w)

    # for w in removeList:
    #     set3.remove(w)

    # sorted_tf_idf = [(w, val) for (w, val) in sorted_tf_idf[:15] if w not in removeList]

    # offset_dict_tf_idf = {}
    # for words3 in set3:
    #     offset_dict_tf_idf[words3] = doc.offsets(words3)
    """ Printing the resuts"""
    # print (raw_text)

    # print ("\n\nText Rank of the document:")
    # printResult (sorted_text_rank, word_tag_dict, offset_dict_text_rank)
    # printTable (sorted_text_rank, offset_dict_text_rank)
    # printMatrix (offset_dict_text_rank)

    print("\n\nTf Scores of the document:\n")
    printResult(sorted_tfValues, word_tag_dict, offset_dict_tf)
    out_list, tid_word_dict = printTable(sorted_tfValues, offset_dict_tf)
    words_list = printMatrix(offset_dict_tf)
    print_top_sentence(raw_text, sorted_tfValues, matches, out_list,
                       tid_word_dict, words_list)

    print_sentences(raw_text, sorted_tfValues, tid_word_dict, words_list)
Example #25
def context(word):

    return nltk.ConcordanceIndex(tokens)


def show_word_in_context2(target_word, text, context_size=5):
    # NOTE: this def line is reconstructed; the signature is assumed from the
    # call show_word_in_context2("scared", text) below, and the context_size
    # default is a guess.
    words = nltk.word_tokenize(text)
    stemmer = nltk.LancasterStemmer()
    target_stem = stemmer.stem(target_word)

    text_parts = []
    for word_num, word in enumerate(words):
        if stemmer.stem(word) == target_stem:
            start = max(word_num - context_size, 0)
            stop = word_num + context_size + 1
            text_parts.append(words[start:stop])

    return text_parts


print "Version 2:"
text_parts = show_word_in_context2("scared", text)
print "Found {} occurences:".format(len(text_parts))
for part in text_parts:
    print " ".join(part)

# Bonus
# ~~~~~
# NLTK has some ready made concordance related objects. In particular, a
# possible solution to the problem could be:
print "Version 3:"
words = nltk.word_tokenize(text)
stemmer = nltk.LancasterStemmer()
c_stemmed = nltk.ConcordanceIndex(words, key=lambda s: stemmer.stem(s.lower()))
print c_stemmed.print_concordance("scared")

# The object offers more convenience functions. The locations for the matches
# are available with the `offsets` method. That allows to collect the words
# that follow the matches for example:
print[words[i + 1] for i in c_stemmed.offsets("scared")]
def find_word(file_lines, forward_sentence_type_in_a_list,
              forward_identify_section_span, word):

    result_for_csv_file_before = []

    for line_index_number, line_text in enumerate(file_lines):

        word_tokens = word_tokenize(line_text, language="english")

        sentence_word_count = len(word_tokens)

        if sentence_word_count > 0:

            nltk_concordance_index = nltk.ConcordanceIndex(word_tokens)

            for offset in nltk_concordance_index.offsets(word):

                identified_sentence_part_and_sub_part_as_tuple = get_sentence_type_from_sentence_index_number(
                    line_index_number, forward_sentence_type_in_a_list)
                part_value = identified_sentence_part_and_sub_part_as_tuple[0]
                sub_part_value = identified_sentence_part_and_sub_part_as_tuple[
                    1]

                # guard against running past the end of the sentence and check
                # both candidate tokens for numeric values
                if (offset + 3 < len(word_tokens)
                        and is_float(word_tokens[offset + 1])
                        and is_float(word_tokens[offset + 3])):

                    paragraph_number = get_section_from_sentence_index_number(
                        line_index_number, forward_identify_section_span)

                    relation = word_tokens[offset]
                    value_1 = word_tokens[offset + 1] + " " + word_tokens[
                        offset + 2] + " " + word_tokens[offset + 3]

                    if word_tokens[offset + 2] == "January" or word_tokens[
                            offset + 2] == "February" or word_tokens[
                                offset + 2] == "March" or word_tokens[
                                    offset + 2] == "April" or word_tokens[
                                        offset + 2] == "May" or word_tokens[
                                            offset +
                                            2] == "June" or word_tokens[
                                                offset +
                                                2] == "July" or word_tokens[
                                                    offset +
                                                    2] == "August" or word_tokens[
                                                        offset +
                                                        2] == "September" or word_tokens[
                                                            offset +
                                                            2] == "October" or word_tokens[
                                                                offset +
                                                                2] == "November" or word_tokens[
                                                                    offset +
                                                                    2] == "December":

                        # Remove the word "Section " for the Java application
                        only_output_section_number = paragraph_number.replace(
                            "Section ", "")

                        result_for_csv_file_before.append([
                            only_output_section_number, part_value,
                            sub_part_value, relation, value_1
                        ])

    return list(result_for_csv_file_before)