Example #1
    def find_ml(self, td):
        f_tokenizer = TreebankWordTokenizer()
        query_words = f_tokenizer.tokenize(td)
        genres = self.sentiment_analysis(query_words)
        weighted_genres = []
        genre_weights = {}
        for x in genres:
            if x[1] is not None:
                weighted_genres.append(x[0])
                genre_weights[x[0]] = x[1]

        d_score_updates = {}
        for movie in self.movies:
            g = self.genre_dict[movie][0]
            total_genre_score = 0
            if u'Comedy' in g and 'comedy' in weighted_genres:
                total_genre_score += genre_weights['comedy']
            if u'Action' in g and 'action' in weighted_genres:
                total_genre_score += genre_weights['action']
            if u'Crime' in g and 'crime' in weighted_genres:
                total_genre_score += genre_weights['crime']
            if u'Drama' in g and 'drama' in weighted_genres:
                total_genre_score += genre_weights['drama']
            d_score_updates[self.movies.index(movie)] = total_genre_score * .1

        return d_score_updates
Example #2
class CRCleaner(Cleaner):
    def __init__(self, input_dir, output_dir):
        super(CRCleaner,self).__init__(input_dir, output_dir, u"-\n'", punctuation+digits)
        self.t = TreebankWordTokenizer()
    
    def cleaned_text(self, text):
        if len(text) == 0:
            return u""
        sans_xml = self.xml_to_txt(text)
        arr = self.t.tokenize(sans_xml)
        return self.reconstruct_arr(arr)
    
    def xml_to_txt(self, xml):
        arr = []
        dom = parseString(xml)
        for node in (dom.firstChild.getElementsByTagName('speaking')+dom.firstChild.getElementsByTagName('speaking-unknown-id')):
            paragraphs = node.getElementsByTagName('paragraph')
            if len(paragraphs) > 0:
                for node2 in paragraphs:
                    if node2.hasChildNodes():
                        child = node2.firstChild
                        if child.nodeType == child.TEXT_NODE:
                            arr += [child.data.replace(' ',' ')]
        return ' '.join(arr)
    
    def new_filename(self, old_filename):
        return old_filename.replace('.xml', '.txt')
Example #3
def pos_titles_from(input_path, output_path = None, options = None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
Example #4
class TreebankWordTokenizerWrapper:
  """ Seriously I don't know why we need this class - this makes no sense """

  PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

  def __init__(self):
    self.word_tokenizer = TreebankWordTokenizer()

  def tokenize(self, s):
    temp = self.word_tokenizer.tokenize(s)
    if temp:
      it = []
      for t0 in temp:
        t = [t0]
        while True:
          m = self.PAT_NLTK_BUG.search(t[0])
          if m:
            t.insert(0, m.group(1))
            t[1] = m.group(2)
          else:
            break
        it += t
        #sys.stderr.write('DEBUG: t=%s => %s\n' % (t0, t))
    else:
      it = temp
    return it
Example #5
def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens
Example #6
File: utils.py, Project: DSam1991/nlpnet
def tokenize_en(text):
    """
    Return a list of lists of the tokens in text, separated by sentences.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer = TreebankWordTokenizer()
    sentences = [tokenizer.tokenize(sentence) 
                 for sentence in sent_tokenizer.tokenize(text)]
    return sentences
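A brief usage sketch for tokenize_en; the sample text and printed output are illustrative only, and the punkt model is assumed to have been downloaded via nltk.download('punkt').

text = "NLTK ships several tokenizers. This one follows the Penn Treebank conventions."
for sent in tokenize_en(text):
    print(sent)
# e.g.:
# ['NLTK', 'ships', 'several', 'tokenizers', '.']
# ['This', 'one', 'follows', 'the', 'Penn', 'Treebank', 'conventions', '.']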
Example #7
class DssgUnigramExtractor(object):

    """
    An instance of this is used to obtain a list of unigrams, given a text.
    Usages:
    unigramExtractor = DssgUnigramExtractor()
    tokenList = unigramExtractor.extract("here is a text as a string") # ['text', 'string']
    """

    _cache = {}

    def __init__(self):
        self._tokenizer = TreebankWordTokenizer()
        self._stopwordSet = set(stopwords.words("english"))
        self._stemmer = PorterStemmer()

    def __repr__(self):
        return self.__class__.__name__ + "()"

    def extract(self, text):
        """
        Given a text, return a list of unigram tokens.
        """
        if text not in DssgUnigramExtractor._cache:
            text = (
                text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
                .replace("&nbsp;", " ")
            )
            text = nltk.clean_html(text)
            tokens = self._tokenizer.tokenize(text)

            newTokens = []
            for tok in tokens:
                # - lowercase, remove '
                tok = tok.lower().strip("`'.,-_*/:;\\!@#$%^&*()=\"")

                # - remove stopwords, one character word, only numbers
                # - remove one character word
                # - remove only numbers
                if tok in self._stopwordSet or len(tok) <= 1 or isAllNumbers(tok):
                    continue

                # - apply stemming
                # oldTok = copy.deepcopy(tok); # for debug
                tok = self._stemmer.stem(tok)
                # sometimes a token is like 'theres' and becomes stopword after
                # stemming
                if tok in self._stopwordSet:
                    continue

                newTokens.append(tok)
            DssgUnigramExtractor._cache[text] = newTokens
        return DssgUnigramExtractor._cache[text]
Example #8
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token)  for token in tagger.tag(tokens)])
            print " ".join([token[1]  for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
Example #9
	def getNoun(self, parser, sentence):
		#mysent = sentence.encode('ascii','ignore')
		#sent = mysent.decode()
		penn = TreebankWordTokenizer()
		tags = parser.tag(penn.tokenize(sentence))
		the_tags = []
		nouns = []
		for t in tags:
			if t[1].startswith('NN'):
				nouns.append(t[0])
		return ' '.join(nouns)
Example #10
def genLexicon(data):

	tok = TreebankWordTokenizer()

	texts = []
	for doc in data:
		for sent in doc:
			texts.append(tok.tokenize( sent[1].lower() ))

	dictionary = corpora.Dictionary(texts)

	pickle.dump(dictionary, open("lex/toy.lex", "w"))
Example #11
class MorphyStemmer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def __call__(self, doc):
        stemmed_doc = []
        for t in self.tokenizer.tokenize(doc):
            stem = wordnet.morphy(t)
            if stem:
                stemmed_doc.append(stem.lower())
            else:
                stemmed_doc.append(t.lower())
        return stemmed_doc
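One possible use of this callable, sketched with an illustrative sentence; it assumes the WordNet corpus is available and the same imports the class above relies on.

from nltk.corpus import wordnet
from nltk.tokenize import TreebankWordTokenizer

stem = MorphyStemmer()
print(stem("The wolves were chasing geese"))
# morphy maps inflected forms it knows ('wolves' -> 'wolf', 'geese' -> 'goose');
# tokens it cannot map are simply lowercased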
Example #12
def crear_dicc_doc_term(path):
    result = []
    result_aux = []
    file = open(path)
    for f in file:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        words = temp
        result_aux += eiminar_stopwords(words)
    return result_aux
Example #13
def section_02_02( datDIR ):

    print("\n### ~~~~~ Section 02.02 ~~~~~~~~");

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    textfile = os.path.join( datDIR , "the-great-gatsby.txt" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file = textfile, mode = 'r') as inF:
        sentences = []
        for i, tempLine in enumerate(inF):
            if i > 100:
                break
            tempLine = tempLine.strip()
            sentences.append(tempLine)
            print( "%5d: %s" % (i,tempLine) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentence = sentences[20] + " " + sentences[21]
    print("\nmySentence:")
    print(   mySentence  )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #tokens = mySentence.split("([-\s.,;!?])+")
    tokens = re.split("([-\s.,;!?])+",mySentence)
    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print(   temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myPattern = re.compile("([-\s.,;!?])+")
    tokens = myPattern.split(mySentence)
    print("\ntokens[-10:]")
    print(   tokens[-10:] )

    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print(   temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myRegexpTokenizer = RegexpTokenizer("\w+|$[0-9.]+|\S+")
    print("\nmyRegexpTokenizer.tokenize(mySentence):")
    print(   myRegexpTokenizer.tokenize(mySentence)  )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myTreebankWordTokenizer = TreebankWordTokenizer()
    print("\nmyTreebankWordTokenizer.tokenize(mySentence):")
    print(   myTreebankWordTokenizer.tokenize(mySentence)  )
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Example #14
 def word_tokenizePT(self,  text, tokenizer):
     """ tokenize a portuguese sentence in words
     @input params: sentence - a sentence, a phrase (self)
                    tokenizer - "TB" for TreebankWordTokenizer
                                "WP" for WordPunctTokenizer
     @returns word's list or error """
     if tokenizer == "TB":
         tokenizerTB = TreebankWordTokenizer()
         return tokenizerTB.tokenize(text)
     elif tokenizer == "WP":
         tokenizerWP = WordPunctTokenizer()
         return tokenizerWP.tokenize(text)
     else:
         return "tokenizer error: not found" 
Example #15
def tf_normalized(full_texts):
    tokenizer = Tokenizer()
    tf = {}
    max_value = 0
    for text in full_texts:
        text_tokens = tokenizer.tokenize(text)
        text_tokens = escape_not_abbreviations(text_tokens)
        for token in text_tokens:
            token = token.lower()
            tf.setdefault(token, 0.0)
            tf[token] += 1.0
            if tf[token] > max_value:
                max_value = tf[token]
    for t in tf:
        tf[t] = tf[t]/max_value
    return tf
Example #16
File: prepare.py, Project: Huarong/cloze
    def _compute_unigram_frequency(self):
        wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
        tokenizer = TreebankWordTokenizer()
        total = len(wordlists.fileids())
        count = 0
        fdist = nltk.FreqDist()
        for fl in wordlists.fileids():
            count += 1
            fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
            with open(fl_abs_path, 'r') as f:
                words = tokenizer.tokenize(f.read())
                fdist.update(words)
            print 'freqdist: %s of %s' % (count, total)

        with open(os.path.join(self.corpus_root, 'unigram_frequency.txt'), 'w') as f:
            f.writelines(['%s %s\n' % (word, freq) for (word, freq) in fdist.items()])
        return None
Example #17
class Tokenizer(object):
    
    def __init__(self, language='english'):
        self.paragraph_tokenizer = nltk.data.load('tokenizers/punkt/%s.pickle' % language)
        self.sentence_tokenizer = TreebankWordTokenizer()
        self.english_stops = set(stopwords.words(language))
        
    def tokenize(self, text, remove_stopwords=False):
        sentences = self.paragraph_tokenizer.tokenize(text)
        token = []
        for sentence in sentences:
            words = self.sentence_tokenizer.tokenize(sentence)
            if remove_stopwords:
                token.append([word for word in words if word not in self.english_stops])
            else:
                token.append(words)
        return token
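A hedged usage sketch for the wrapper above (punkt and stopwords corpora assumed downloaded; the example sentence is made up):

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

tk = Tokenizer()
print(tk.tokenize("The cat sat on the mat. It was happy.", remove_stopwords=True))
# [['The', 'cat', 'sat', 'mat', '.'], ['It', 'happy', '.']]
# note: tokens are not lowercased before the stopword check, so capitalised 'The' and 'It' survive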
Example #18
def eiminar_stopwords(words):
    a = open('english.txt')
    result = []
    english_stops = []
    for f in a:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        english_stops += temp
    resultado = []
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    for w in words:
        if not w in english_stops:
            resultado.append(stemmer.stem(w))
    return resultado
Example #19
File: prepare.py, Project: Huarong/cloze
 def _compute_biagram_frequency(self):
     if not os.path.exists(self.bigram_frequency_dir):
         os.mkdir(self.bigram_frequency_dir)
     wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
     tokenizer = TreebankWordTokenizer()
     total = len(wordlists.fileids())
     count = 0
     for fl in wordlists.fileids():
         count += 1
         print 'freqdist: %s of %s' % (count, total)
         fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
         with open(fl_abs_path, 'r') as f:
             words = tokenizer.tokenize(f.read())
             bi_words = nltk.bigrams(words)
             fdist = nltk.FreqDist(bi_words)
         with open(os.path.join(self.bigram_frequency_dir, fl), 'w') as f:
             f.writelines(['%s %s %s\n' % (word[0], word[1], freq) for (word, freq) in fdist.items()])
     return None
Example #20
def text_fdist(text, min_occurence):
    from nltk.probability import FreqDist
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()

    #tokenise words:
    tokens = tokenizer.tokenize(text)
    #remove stopwords
    tokens = [
        token.lower() for token in tokens if token.lower() not in stopwords_fr
    ]
    print(tokens)

    fdist_in = FreqDist(tokens)

    #filter words with more than one occurence
    fdist = list(filter(lambda x: x[1] >= min_occurence, fdist_in.items()))
    return fdist
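stopwords_fr is defined outside this snippet; a minimal sketch, assuming it is simply the NLTK French stopword list, with made-up input:

from nltk.corpus import stopwords
stopwords_fr = set(stopwords.words('french'))  # assumption about how stopwords_fr was built

texte = "le chat dort et le chat mange et le chat joue"
print(text_fdist(texte, min_occurence=2))
# [('chat', 3)] -- 'le' and 'et' are stopwords, and the other words occur only once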
Example #21
def tokenize(text, stopword=False, punct=False, lower=False,
             stem=False, num=False, single=False, link=False):
    """
    num: True, exclude numbers
    single: True, exclude single char
    todo: deal with unicode mafuckers
    """
    token = []
    tokenizer = TreebankWordTokenizer()
    token_temp = tokenizer.tokenize(text)
    for elt in token_temp:
        #temp = i.decode('unicode-escape')
        #temp = re.sub(ur'[\xc2-\xf4][\x80-\xbf]+',
        #             lambda m: m.group(0).encode('latin1').decode('utf8'), temp)
        temp = unicode(elt)
        temp = unicodedata.normalize('NFKD', temp).encode('ascii', 'ignore')

        # get rid of empty strings
        #temp = i
        if temp:
            token.append(temp)

    token = [clean_front_end(word) for word in token if clean_front_end(word)]

    if lower:
        token = [word.lower() for word in token]
    if stem:
        token = [stemmer.stem(word) for word in token]
    if num:
        token = [word for word in token if not is_number(word)]
    if single:
        token = [word for word in token if len(word) > 1]
    if stopword:
        token = [word for word in token if word not in STOPWORD]
    if punct:
        token = [word for word in token if word not in PUNCT]
    if link:
        token = [word for word in token if not is_link(word)]

    #exclude empty strings
    token = [word for word in token if word]

    return token
Example #22
class nlp:
    def __init__(self):
        self.tb = tb
        self.porter = nltk.PorterStemmer()
        self.tk = TreebankWordTokenizer()
        self.stopwords = set(stopwords.words())
    def tag(self,text):
        blob = self.tb(text)
        return blob.tags
    # 'clean' applies stemming and strips punctuation/non-letter characters
    def noun(self,text,clean=True):
        text = text.replace('\\n',' ')
        text = text.replace('\\t',' ')
        blob = self.tb(text)
        tags = blob.tags
        result = []
        for (aword,atag) in tags:
            if atag == "NNP" or atag == "NNS" or atag == "NN":
                result.append(aword.lower())

        if clean == True:
            clean_result = []
            for word in result:
                nword = self.porter.stem(remove_non_chap(word))
                #nword = small_stem(remove_non_chap(word))
                if len(nword) > 2:
                    clean_result.append(nword)
            return clean_result
        return result
        
    # this may not work very well; avoid using it for now
    def noun_p(self,text):
        blob = self.tb(text)
        return blob.noun_phrases

    def token(self,text):
        result,clean_result = self.tk.tokenize(text),[]
        for word in result:
            nword = word.lower()
            nword = small_stem(nword)
            if len(nword) <= 30:
                clean_result.append(nword)
        return ' '.join(clean_result)
Example #23
def preprocessing(para):
    print "\n\n\nStep 1: Preprocessing"
    print "Involves Processing of text"
    print "\n\nTokenizing Text into Sentences"

    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sent = sent_tokenizer.tokenize(para)
    print "The sentences are:"
    for s in sent:
        print s
    print "\n\nTokenizing Sentences into Words"
    tokenizer = TreebankWordTokenizer()
    tokens = []
    for s in sent:
        tokens.extend(tokenizer.tokenize(s))
    print "The words are:"
    print tokens

    return tokens
Example #24
def prepro_sent(text, word_map):
    
    # tokenizers
    word_tokenizer = TreebankWordTokenizer()

    # tokenize sentences into words
    sentence = word_tokenizer.tokenize(text)[:word_limit]

    # number of words in sentence
    words_per_sentence = len(sentence)
    words_per_sentence = torch.LongTensor([words_per_sentence]).to(device)  # (1)

    # encode sentence with indices from the word map
    encoded_sent = list(
        map(lambda w: word_map.get(w, word_map['<unk>']), sentence)
    ) + [0] * (word_limit - len(sentence))
    encoded_sent = torch.LongTensor(encoded_sent).unsqueeze(0).to(device)

    return encoded_sent, words_per_sentence
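A hedged sketch with toy stand-ins for the globals the function relies on (word_limit, device, and word_map here are assumptions, not values from the original project):

import torch
from nltk.tokenize import TreebankWordTokenizer

word_limit = 10
device = torch.device('cpu')
word_map = {'<unk>': 1, 'hello': 2, 'world': 3}

encoded, n_words = prepro_sent("hello world", word_map)
print(encoded.shape)  # torch.Size([1, 10]) -- padded up to word_limit
print(n_words)        # tensor([2])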
Example #25
def stopwords(filename):
    """A function that returns a dictionary with tokens as keys
    and counts of how many times each token appeared as values in
    the file with the given filename.

    Inputs:
        filename - the name of a plaintext file with a document on each line
    Outputs:
        A list of stopwords and a dictionary mapping tokens to counts.
    """
    
    # We now track the number of times a word shows up (term frequency) and
    # the number of documents with a given word in it (document frequency)
    # separately. We use a Counter, which is exactly like a dictionary except
    # - the values can only be ints
    # - any key it hasn't seen yet is assumed to already have a value of 0
    # This means we don't have to check whether we've used a key before when
    # we use the "+= 1" operation.
    term_frequency_dict = Counter()
    word_total = 0
    
    tokenizer = TreebankWordTokenizer()

    with open(filename, 'r') as f:
        for line in f:
            words = tokenizer.tokenize(line.lower())       

            # For the programmer types: there are several more efficient
            # ways to write this section using dictionaries or sets. You're
            # welcome to rewrite this part to exercise that.      
            for word in words:
                term_frequency_dict[word] += 1
                word_total += 1

    # A fun feature of Counters is that they have a built-in function that
    # gives you the n keys with the biggest values, or the "most common"
    # things being counted. We can use this to find the most common words.
    # This comes out as a list of pairs of key and value, like
    # [('foo', 10), ('bar', 7), ... , ('rare', 1)]
    stoplist_pairs = term_frequency_dict.most_common(100)
    stoplist = [word for (word, freq) in stoplist_pairs]
    
    return stoplist, term_frequency_dict, word_total
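A small worked example; the file name and contents are made up, and Counter / TreebankWordTokenizer are imported as in the function above.

from collections import Counter
from nltk.tokenize import TreebankWordTokenizer

with open('tiny_corpus.txt', 'w') as f:
    f.write("the cat sat on the mat\n")
    f.write("the dog sat on the log\n")

stoplist, tf_dict, total = stopwords('tiny_corpus.txt')
print(total)           # 12 tokens in total
print(tf_dict['the'])  # 4
print(stoplist[:3])    # the three most frequent tokens, e.g. ['the', 'sat', 'on']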
Example #26
def tokenizarPorTipo():
    cadena = "Sorry, I can't go to the meeting.\n"
    print("TreebankWordTokenizer - 1")
    print("WhitespaceTokenizer - 2")
    print("SpaceTokenizer - 3")
    print("WordPunctTokenizer - 4")
    num = input("Introduzca un tokenizer: ")
    if num == "1":
        tokenizer = TreebankWordTokenizer()
    elif num == "2":
        tokenizer = WhitespaceTokenizer()
    elif num == "3":
        tokenizer = SpaceTokenizer()
    elif num == "4":
        tokenizer = WordPunctTokenizer()
    else:
        return

    tokens = tokenizer.tokenize(cadena)
    print(tokens)
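For reference, a non-interactive sketch contrasting the four tokenizers offered by the menu above on the same sentence:

from nltk.tokenize import (TreebankWordTokenizer, WhitespaceTokenizer,
                           SpaceTokenizer, WordPunctTokenizer)

cadena = "Sorry, I can't go to the meeting.\n"
print(TreebankWordTokenizer().tokenize(cadena))  # splits the contraction: ..., 'ca', "n't", ...
print(WhitespaceTokenizer().tokenize(cadena))    # splits on runs of whitespace only
print(SpaceTokenizer().tokenize(cadena))         # splits on single spaces, so the trailing '\n' stays attached
print(WordPunctTokenizer().tokenize(cadena))     # splits at punctuation: ..., 'can', "'", 't', ...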
Example #27
    def post(self):
        args = post_args.parse_args()
        word = args.word
        res = []

        for root, dirs, files in os.walk(dataset_path):
            for file in files:
                filePath = root + "/" + str(file)
                fileOpen = open(filePath, "r", encoding="utf8")
                tokenizer = TreebankWordTokenizer()
                text = nltk.Text(tokenizer.tokenize(fileOpen.read()))
                r = fileOpen.read()
                texted = nltk.Text(text)
                ttokens = self.n_concordance_tokenised(text=texted,
                                                       phrase=word)
                for t in ttokens:
                    ans = t.partition(word)
                    res.append(ans)

        return jsonify(res)
Example #28
    def compute_embeddings(self, texts, embedding_index):
        tokenizer = TreebankWordTokenizer()
        embeddings = []

        for text in texts:
            embedding = []

            for word in tokenizer.tokenize(text):
                word_embedding = self.compute_word_embedding(
                    word, embedding_index)
                if word_embedding is not None:
                    embedding.append(np.array(word_embedding))
                else:
                    # pad with 0s
                    zero_arr = np.zeros(25, )
                    embedding.append(zero_arr)
                    continue

            embeddings.append(embedding)
        return embeddings
Example #29
File: essay.py, Project: mrmacthree/hlt
    def sentences(self, lowercase=False, strip_punct=[], num_placeholder=None):
        word_tokenizer=TreebankWordTokenizer()
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle')
        token_sents = [word_tokenizer.tokenize(sent) for sent in sent_tokenizer.tokenize(self.response)]

        if lowercase:
            token_sents = [[token.lower() for token in sent] for sent in token_sents]

        if len(strip_punct) > 0:
            token_sents = [[token for token in sent if token not in strip_punct] for sent in token_sents]

        if num_placeholder is not None:
            def replace_num(token, placeholder):
                try:
                    float(token.replace(',',''))
                    return placeholder
                except ValueError:
                    return token
                
            token_sents = [[replace_num(token, num_placeholder) for token in sent] for sent in token_sents]
        return token_sents
Example #30
def build(path):
    for filename in glob.glob(os.path.join(path, '*.txt')):
        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                s = line.lower().strip('\n')
                # tokenizer = RegexpTokenizer('[a-z]\w+')
                tokenizer = TreebankWordTokenizer()
                tokens = tokenizer.tokenize(s)
                for t in tokens:
                    if t in invert_index:
                        files = invert_index[t]
                        # Update the word count by 1
                        if filename in files:
                            files[filename] += 1
                        # A new file contains this word
                        else:
                            invert_index[t][filename] = 1
                    else:
                        invert_index[t] = {filename: 1}

    pickle.dump(invert_index, open("invert_index.p", "wb"))
Example #31
def main(argv):
    if 'run_test' in argv:
        run_test = True
    else:
        run_test = False
    if 'print_runtime' in argv:
        print_runtime = True
        start_time = time.time()
    else:
        print_runtime = False
    file_names = find_files()
    stop_word_set = set(stopwords.words('english'))
    if 'short_test' in argv:
        files_to_process = ['xin_eng_200201.xml.gz']
    else:
        files_to_process = file_names
    tokenizer = TreebankWordTokenizer()
    wnl = WordNetLemmatizer()
    output_filename = 'xinhua-om-lema.txt'
    output = open(output_filename, "wt")
    for downloaded_file in files_to_process:
        print('Working on {}'.format(downloaded_file))
        for paragraph in [paragraphs.text for paragraphs in ET.fromstring(gzip.open(downloaded_file).read())\
                          .findall(".//*[@type='story']//P")]:
            if not paragraph:
                continue
            for sentence in sent_tokenize(paragraph):
                filtered_words = [word for word in tokenizer.tokenize(sentence)\
                                  if word.lower() not in stop_word_set \
                                  and re.search("^[a-zA-Z]+$", word)]
                if not filtered_words:
                    continue
                output.write(' '.join(
                    [wnl.lemmatize(word).lower()
                     for word in filtered_words]) + '\n')
    output.close()
    if print_runtime:
        run_time = time.time() - start_time
        print('Total Processing Time: {0:.2f} minutes'.format(run_time / 60))
    return None
Example #32
def index_search(query, index, idf, doc_norms):
    treebank_tokenizer = TreebankWordTokenizer()
    query_toks = treebank_tokenizer.tokenize(query.lower())
    scores = {}
    query_tf = Counter(query_toks)
    for term, term_tf in query_tf.items():
        if term in index:
            for (doc, shoe_id, tf) in index[term]:
                scores[doc] = scores.get(doc, 0) + term_tf * idf[term] * tf * idf[term]
    
    q_norm = 0
    for term, tf in query_tf.items():
        if term in index:
            q_norm += math.pow(tf * idf[term], 2)
    
    q_norm = math.sqrt(q_norm)
    
    res = []
    for doc, score in scores.items():
        res.append((score / (q_norm * doc_norms[doc]), doc))
    
    return sorted(res, key=lambda tup: (-tup[0], tup[1]))
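A hedged sketch of the data shapes index_search expects; every value below is made up purely to exercise the function.

import math
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer

# index[term]    -> list of (doc, shoe_id, tf) postings
# idf[term]      -> inverse document frequency of the term
# doc_norms[doc] -> precomputed norm of the document's tf-idf vector
index = {'red':  [(0, 'shoe-1', 2), (1, 'shoe-2', 1)],
         'boot': [(0, 'shoe-1', 1)]}
idf = {'red': 1.0, 'boot': 2.0}
doc_norms = {0: 2.83, 1: 1.0}

print(index_search("red boot", index, idf, doc_norms))
# a list of (cosine_score, doc) pairs sorted by descending score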
Example #33
def rate_sentence(sentence, Vfrase, listaNegativas, listaPositivas):
    negadores = []
    valor = 0
    tokenizer = TreebankWordTokenizer()
    tagger = nltk.data.load(_POS_TAGGER)
    tags = tagger.tag(tokenizer.tokenize(sentence))
    for i in tags:
        if (i[1] == 'NN') or (i[1] == 'NNS') or (i[1] == 'NNP') or (i[1]
                                                                    == 'NNPS'):
            valor += calcularValorPalabra(i[0], "sust", "N", Vfrase,
                                          listaNegativas, listaPositivas)
        if (i[1] == 'JJ' or (i[1] == 'JJR') or (i[1] == 'JJS')):
            valor += calcularValorPalabra(i[0], "adj", "N", Vfrase,
                                          listaNegativas, listaPositivas)
        if (i[1] == 'VB' or (i[1] == 'VBD') or (i[1] == 'VBG')
                or (i[1] == 'VBN') or (i[1] == 'VBP') or (i[1] == 'VBZ')):
            valor += calcularValorPalabra(i[0], "verb", "N", Vfrase,
                                          listaNegativas, listaPositivas)
        if (i[1] == 'RB' or (i[1] == 'RBR') or (i[1] == 'RBS')):
            valor += calcularValorPalabra(i[0], "adv", "N", Vfrase,
                                          listaNegativas, listaPositivas)
    return valor
Example #34
def prepro_doc(document, word_map):

    # tokenizers
    sent_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = TreebankWordTokenizer()

    # a list to store the document tokenized into words
    doc = list()

    # tokenize document into sentences
    sentences = list()
    for paragraph in get_clean_text(document).splitlines():
        sentences.extend([s for s in sent_tokenizer.tokenize(paragraph)])

    # tokenize sentences into words
    for s in sentences[:sentence_limit_per_doc]:
        w = word_tokenizer.tokenize(s)[:word_limit_per_sentence]
        if len(w) == 0:
            continue
        doc.append(w)

    # number of sentences in the document
    sentences_per_doc = len(doc)
    sentences_per_doc = torch.LongTensor([sentences_per_doc]).to(device)  # (1)

    # number of words in each sentence
    words_per_each_sentence = list(map(lambda s: len(s), doc))
    words_per_each_sentence = torch.LongTensor(words_per_each_sentence).unsqueeze(0).to(device)  # (1, n_sentences)

    # encode document with indices from the word map
    encoded_doc = list(
        map(lambda s: list(
            map(lambda w: word_map.get(w, word_map['<unk>']), s)
        ) + [0] * (word_limit_per_sentence - len(s)), doc)
    ) + [[0] * word_limit_per_sentence] * (sentence_limit_per_doc - len(doc))
    encoded_doc = torch.LongTensor(encoded_doc).unsqueeze(0).to(device)

    return encoded_doc, sentences_per_doc, words_per_each_sentence
Example #35
 def _preprocess(self, listlikeobj, stop_lists=None):
     """Applies pre-processing pipelines to lists of string
     """
     
     numeric = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', \
                 'ten', 'Eleven', 'Twelve', 'Thirteen', 'Fourteen', 'Fifteen', 'Sixteen', 'Seventeen', \
                 'Eighteen', 'Nineteen', 'Twenty', 'Twenty-one', 'Twenty-two', 'Twenty-three', \
                 'Twenty-four', 'Twenty-five', 'Twenty-six', 'Twenty-seven', 'Twenty-eight', \
                 'Twenty-nine', 'Thirty', 'Thirty-one']
     
     ordinal = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eight', 'ninth', \
                 'tenth', 'eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', \
                 'seventeenth', 'eighteenth', 'nineteenth', 'twentieth', 'twenty-first', 'twenty-second', \
                 'twenty-third', 'twenty-fourth', 'twenty-fifth', \
                 'twenty-sixth', 'twenty-seventh', 'twenty eighth', 'twenty-ninth', 'thirtieth', 'thirty-first']
     
     
     en_stop = get_stop_words('en')
     tokenizer = TreebankWordTokenizer()
     p_stemmer = PorterStemmer()
     
     listlikeobj = listlikeobj.apply(lambda row: row.lower())
     listlikeobj = listlikeobj.apply(lambda row: tokenizer.tokenize(row))
     listlikeobj = listlikeobj.apply(lambda row: [i for i in row if i not in en_stop])
     listlikeobj = listlikeobj.apply(lambda row: [i for i in row if i not in string.punctuation])
     listlikeobj = listlikeobj.apply(lambda row: [p_stemmer.stem(i) for i in row])
     if stop_lists:
         for sw_dict in stop_lists:
             listlikeobj = listlikeobj.apply(lambda row: [i for i in row if i not in sw_dict])
     #listlikeobj = listlikeobj.apply(lambda row: [re.sub(r'\d', "#", i) for i in row])
     #listlikeobj = listlikeobj.apply(lambda row: ["#" for i in row if i in numeric])
     #listlikeobj = listlikeobj.apply(lambda row: ["#th" for i in row if i in ordinal])
     #print(listlikeobj)
     
     #listlikeobj = listlikeobj.apply(lambda row: [spell(i) for i in row if len(i)>6])
     
     
     return listlikeobj
Example #36
 def GetWordCount2(self, data):
     #print(data)
     tokenizer = TreebankWordTokenizer()
     stop_words = set(stopwords.words('english'))
     words = []
     POSVals = {}
     wordcount = defaultdict(int)
     words = tokenizer.tokenize(data)
     for j in set(words):
         wordcount[j] = wordcount[j] + words.count(j)
     for (k, v) in list(wordcount.items()):
         if (k.lower() in stop_words
                 or k.lower() in list(string.punctuation)):
             del wordcount[k]
         else:
             # print(PosTags(k))
             POSVals[k] = self.PosTags(k)
     # print(POSVals)
     return {
         'WORDS': [k for k in sorted(wordcount.keys())],
         'COUNTS': [wordcount[k] for k in sorted(wordcount.keys())],
         'POS': [POSVals[k] for k in sorted(wordcount.keys())]
     }
Example #37
def make_word_set(context):
    """ Computes the set of all words used in a list of strings.

    Arguments
    =========

    context: a list of strings

    Returns
    =======

    word_set: set of distinct words
    """
    tokenizer = TreebankWordTokenizer()
    sw = stopwords.words('english')
    word_list = []
    for string in context:
        tkns = tokenizer.tokenize(string)
        for tk in tkns:
            if tk not in sw:
                word_list.append(tk)
    word_set = set(word_list)
    return word_set
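An illustrative call (the NLTK English stopword corpus is assumed downloaded; the exact result depends on that list):

from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

context = ["the faster Harry got to the store", "Harry is hairy and faster than Jill"]
print(make_word_set(context))
# roughly {'faster', 'Harry', 'got', 'store', 'hairy', 'Jill'} -- 'the', 'to', 'is', 'and', 'than' are stopwords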
Example #38
 def vectorize(self, dataset):
     print("vectorizing")
     if not self.embedding:
         GLOVE_DIR = "/media/D/data/glove/"
         GLOVE_W2V_FILE = "glove.840B.300d.w2vformat.txt"
         GLOVE_W2V_PATH = os.path.join(GLOVE_DIR, GLOVE_W2V_FILE)
         glove_model = gensim.models.KeyedVectors.load_word2vec_format(
             GLOVE_W2V_PATH)
         # print("time taken loading glove: {}".format(time.time()-t))
         self.embedding = glove_model.wv
     wv = self.embedding
     tokenizer = TreebankWordTokenizer()
     vectorized_data = []
     for sentence in dataset:
         sample_vecs = []
         for token in tokenizer.tokenize(sentence):
             try:
                 sample_vecs.append(wv[token])
             except KeyError:
                 # print(token, "not in wv")
                 pass
         vectorized_data.append(sample_vecs)
     return vectorized_data
Example #39
class TextTokenizer(object):
    """分词
    去除标点符号,保留@,$,&,',TreebankWordTokenizer分词
    """
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()
        self.puncts = []  # characters to keep: @, $, &

    def _clean_punct(self, string):
        # strip punctuation
        new_string = re.sub(r'[,.;+=<>()/:_?!$@&%*|{}\-\[\]\"\']', ' ',
                            string)  # keep @, $, &
        return new_string

    def _extract_punct(self, string):
        # keep @, $, &
        """
        new_string = string
        for punct in self.puncts:
            if punct == '$':
                new_string = re.sub('\$', ' ' + punct + ' ', new_string)
            else:
                new_string = re.sub(punct, ' '+punct+' ', new_string)
        return new_string
        """
        return string

    def _tokenize(self, string):
        # tokenize with TreebankWordTokenizer
        word_list = self.tokenizer.tokenize(string)
        return word_list

    def tokenize(self, string):
        # combined processing: clean punctuation, restore kept characters, tokenize
        new_string = self._extract_punct(self._clean_punct(string))
        word_list = self._tokenize(new_string)
        return word_list
Example #40
def GetWordCount2(data):
    tokenizer = TreebankWordTokenizer()
    stop_words = set(stopwords.words('english'))
    words = []
    POSVals = {}
    wordcount = defaultdict(int)
    for i in data:
        if i == '\n':
            continue
        else:
            #i = i.encode('utf-8')
            words = tokenizer.tokenize(i)

            # print(words)

            for j in set(words):

                #j = j.decode('utf-8').strip()

                wordcount[j] = wordcount[j] + words.count(j)

                # print(wordcount)

    # print 'WORD::::::::::COUNT'

    for (k, v) in wordcount.items():
        if k.lower() in stop_words:
            del wordcount[k]
        else:
            #print(PosTags(k))
            POSVals[k] = PosTags(k)
    #print(POSVals)
    return {
        'WORDS': [k for k in sorted(wordcount.keys())],
        'COUNTS': [wordcount[k] for k in sorted(wordcount.keys())],
        'POS': [POSVals[k] for k in sorted(wordcount.keys())]
    }
Example #41
def write_out(infile, out_folder):
    if not os.path.exists(out_folder):
        os.mkdir(out_folder)

    qfile = open(os.path.join(out_folder, 'a.toks'), 'w')
    afile = open(os.path.join(out_folder, 'b.toks'), 'w')
    lfile = open(os.path.join(out_folder, 'sim.txt'), 'w')

    qids = []
    questions = []
    answers = []
    labels = []

    tokenizer = TreebankWordTokenizer()

    qid_count = 0
    qid_old = None
    with open(infile) as inf:
        inf.readline()  # header
        for line in inf:
            fields = line.lower().strip().split('\t')
            qid = fields[0]
            question = ' '.join(tokenizer.tokenize(fields[1]))
            sentence = ' '.join(tokenizer.tokenize(fields[5]))
            label = fields[6]
            if qid != qid_old:
                qid_old = qid
                qid_count += 1
            qids.append(str(qid_count))
            questions.append(question)
            answers.append(sentence)
            labels.append(label)

    dump(questions, os.path.join(out_folder, 'a.toks'))
    dump(answers, os.path.join(out_folder, 'b.toks'))
    dump(labels, os.path.join(out_folder, 'sim.txt'))
    dump(qids, os.path.join(out_folder, 'id.txt'))
Example #42
stemer = nltk.SnowballStemmer("russian")

inp_str = u"сегодня правительством ирана было объявлено о подавлении митингов благодаря напалму"
name_obj = u'правительство ирана'

#def find_object(name_obj,  inp_str, inp_doc_id):
otvet = []
res = {}
flag = False
name_obj = name_obj.split(",")
#ton_doc = TonDocuments.query.get(inp_doc_id)
ton_sents_list = []

for obj in name_obj:
    for sent in punkt_sent_token.tokenize(inp_str):
        tokens = tokenizer.tokenize(sent.lower())
        if obj in sent.lower():
            flag = True
        #if obj.lower().strip().encode("utf-8") in tokens:  #for sistem
        # if obj.lower().strip() in tokens:
        #     #sentiment_val = get_sentiment(sent, dics, tokenizer, punkt_sent_token, stemer)
        #     #otvet.append(sentiment_val)
        #     #ton_sents_list.append(TonSentences(sent.decode("utf-8"), sentiment_val[0][1], sentiment_val[0][0]))
        #     flag=True
        else:
            stem_tokens = []
            for t in tokens:
                # stem_tokens.append(stemer.stem(t.decode("utf-8"))) #for sistem
                stem_tokens.append(stemer.stem(t))
            stem_obj = stemer.stem(obj.strip())
            if stem_obj in ' '.join(stem_tokens):
Example #43
    mag_1 = math.sqrt(sum([x**2 for x in vec1]))
    mag_2 = math.sqrt(sum([x**2 for x in vec2]))

    return dot_prod / (mag_1 * mag_2)


# documents 1-3 make up the corpus; build the vocabulary
docs = [
    "The faster Harry got to the store,the faster and faster Harry would get home."
]
docs.append("Harry is hairy and faster than Jill.")
docs.append("Jill is not as hairy as Harry.")
print(docs)  #17,8,8
doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]  # tokenize and lowercase; duplicates kept
print(len(doc_tokens[0]))
all_doc_tokens = sum(doc_tokens, [])
print(len(all_doc_tokens))
lexicon = sorted(set(all_doc_tokens))  # deduplicate to get the 18-term lexicon
print(len(lexicon))
print(lexicon)

# build a zero-vector template over the lexicon so every document vector has the same dimensions; terms that do not occur stay 0
zero_vector = OrderedDict((token, 0) for token in lexicon)
print(zero_vector)

# vector representation of each document
doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
Example #44
File: lsi.py, Project: jinhongkuan/codes
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from collections import defaultdict
from pprint import pprint
import nltk
from nltk.tokenize import TreebankWordTokenizer
import string
import os
from similarity import is_ci_stem_stopword_set_match
os.chdir(os.path.dirname(__file__))
documents = open("lsi_data.txt", "r").read().splitlines()
stop_words = stopwords.words('english')
tokenizer = TreebankWordTokenizer()
word_list = [[
    x.lower() for x in tokenizer.tokenize(sentence)
    if (x not in stop_words and x not in string.punctuation)
] for sentence in documents]
print(word_list)

frequency = defaultdict(int)
for sent in word_list:
    for token in sent:
        frequency[token] += 1

word_list = [[x for x in sent if frequency[x] > 1] for sent in word_list]
pprint(word_list)

dictionary = corpora.Dictionary(documents=word_list)

dictionary.save("LSA/doc1.dict")
print(dictionary.token2id)
Example #45
nltk.download()

from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer

text = "this is a block of text. I am writing a piece to explain the use of nlp packages."
text = 'Feet wolves talked cats'

######tokenize
tokenizer1 = WhitespaceTokenizer()  #extract based o white space
tokenizer2 = WordPunctTokenizer(
)  #extract based on the white space as well as punctuation
tokenizer3 = TreebankWordTokenizer()

tokens1 = tokenizer1.tokenize(text)
tokens2 = tokenizer2.tokenize(text)
tokens3 = tokenizer3.tokenize(text)

######
#best is first try to lemmetizing and then stem
from nltk.stem import PorterStemmer, WordNetLemmatizer

ps = PorterStemmer()
lem = WordNetLemmatizer()

lemmatized_tokens = []
for token in tokens3:
    lemmatized_tokens.append(lem.lemmatize(token))

#lemmatized and stemmed
lemmatized_tokens = []
for token in tokens3:
    lemmatized_tokens.append(ps.stem(lem.lemmatize(token)))
Example #46
    
"""

##############################################################
# "Tokenizar" y "taggear" un texto:
# nltk.download("maxent_ne_chunker")
# Siempre que definamos una cadena en código lo haremos con el prefijo (u)
cadena = u"—¡Joven «emponzoñado» con el whisky, qué fin… te aguarda exhibir!\nEl veloz murciélago hindú comía feliz cardillo y kiwi.\nLa cigüena tocaba el saxofón detrás del palenque de paja.\nEl pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba a su querido cachorro.\nExhíbanse politiquillos zafios,\ncon orejas kilométricas\n\ty unas de gavilán."

print u"Cadena:"
print "\t", cadena

# Standard word-tokenizer example (words are captured together with adjacent punctuation marks)
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(cadena)
# print u"\nPalabras:"
# print "\t","\n\t".join([addslashes(t) for t in tokens])

# Tokenizer that splits off the words and then the punctuation marks
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
palabras = word_punct_tokenizer.tokenize(cadena)
# print u"\nPalabras/Puntuación:"
# print "\t","\n\t".join([addslashes(t) for t in palabras])

# Spanish version of the sentence tokenizer
import nltk.data
spanish_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")
frases = spanish_tokenizer.tokenize(cadena)
# print u"\nFrases:"
Example #47
from nltk.corpus import brown

text = "Are you curious about tokenization? Let's see how it " \
       "works! We need to analyze a couple of sentences with punctuations " \
       "to see it in action."

sent_tokenize_list = sent_tokenize(text)
print "\nSentence tokenizer:"
print sent_tokenize_list

print "\nWord tokenizer:"
print word_tokenize(text)

treebank_word_tokenizer = TreebankWordTokenizer()
print "\nTreebank word tokenizer:"
print treebank_word_tokenizer.tokenize(text)

word_punct_tokenizer = WordPunctTokenizer()
print "\nWord punct tokenizer:"
print word_punct_tokenizer.tokenize(text)

words = [
    'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches',
    'grounded', 'dreamt', 'envision'
]
# Compare different stemmers
stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')
formatted_row = '{:>16}' * (len(stemmers) + 1)
Example #48
                    file_kpe = os.path.join(dir_output, f[:-4] + ".ann")
                    kpe_file = open(file_kpe, "w")

                    kp_list = []
                    projections_list = kpc.get_document_content_ann(dirname, f[:-4] + ".ann")
                    for projection in projections_list:

                        index_list = projection[1].split()
                        start = int(index_list[1])
                        end = int(index_list[2])
                        prev_token = False

                        if start > 0:
                            prev_text = raw_text[0:start]
                            prev_text_tokens = tokenizer.tokenize(prev_text)
                            if prev_text_tokens:
                                prev_token = prev_text_tokens[-1]
                            else:
                                prev_token = False

                        next_text = raw_text[end:]
                        next_text_tokens = tokenizer.tokenize(next_text)
                        if next_text_tokens:
                            next_token = next_text_tokens[0]
                        else:
                            next_token = False

                        projection_tokens = tokenizer.tokenize(projection[2])

                        test_tokens = []
Example #49
def tokenize(doc):
    tokenizer = TreebankWordTokenizer()
    token = tokenizer.tokenize(doc)
    #token = grams(token)
    return token
Example #50
import ujson
wordsTokenizer = TreebankWordTokenizer()
stopWords = set(stopwords.words('english'))
sentencesTokenizer = load('tokenizers/punkt/english.pickle')
arquivoClassificados = open('classificados.json')
classificados = ujson.load(arquivoClassificados)
arquivoClassificados.close()
acertos = 0
sentimentos = {}
comeco = datetime.now()
for resposta in classificados:
	texto = resposta['corpo']
	frases = sentencesTokenizer.tokenize(texto)
	palavras = []
	for frase in frases:
		palavrasTemp = wordsTokenizer.tokenize(frase)
		palavras.extend([palavra for palavra in palavrasTemp if palavra not in stopWords])
	posTags = pos_tag(palavras)
	positivo = 0
	negativo = 0
	for palavra, tag in posTags:
		synsets = None
		if tag.startswith('J'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
		elif tag.startswith('V'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
		elif tag.startswith('N'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
		elif tag.startswith('R'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
		else:
Example #51
nltk.download('punkt')
nltk.download('treebank')

from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer

tb_tokenizer=TreebankWordTokenizer()

text1 = "Love looks not with the eyes, but with the mind. And therefore is wing'd Cupid painted blind."
text2 = "South Korea population is 48,750,000"

word_tok = word_tokenize(text1)
word_tok2 = word_tokenize(text2)

wordpunct_tok = WordPunctTokenizer().tokenize(text1)
wordpunct_tok2 = WordPunctTokenizer().tokenize(text2)

tb_tok = tb_tokenizer.tokenize(text1)
tb_tok2 = tb_tokenizer.tokenize(text2)

print("word_tokenize를 사용한 경우는 아래와 같습니다.")
print(word_tok)
print(word_tok2)
print("wordpunct_tokenize를 사용한 경우는 아래와 같습니다.")
print(wordpunct_tok)
print(wordpunct_tok2)
print("Treebanktokenize를 사용한 경우는 아래와 같습니다.")
print(tb_tok)
print(tb_tok2)
Example #52
                    for ann in ann_file:
                        ann = unicode(ann, encoding="utf-8")
                        if ann[0] not in ["R", "*"]:
                            ann_items = ann.strip().split("\t")
                            if ann_items[1].find(";") >= 0:
                                type_indexes_tmp = ann_items[1].split(" ")
                                type_indexes = type_indexes_tmp[0:2] + type_indexes_tmp[3:]
                            else:
                                type_indexes = ann_items[1].split(" ")
                            type_indexes[1] = int(type_indexes[1])
                            type_indexes[2] = int(type_indexes[2])
                            indexes_kp_tmp.setdefault(type_indexes[1], -1)
                            if indexes_kp_tmp[type_indexes[1]] < type_indexes[2]:
                                indexes_kp_tmp[type_indexes[1]] = type_indexes[2]
                            ann_text = ann_items[2]
                            tokens = tokenizer.tokenize(ann_text)
                            if without_types:
                                annotation_type = 'KeyPhrase'
                            else:
                                annotation_type = type_indexes[0]
                            pos_tags = [t + (annotation_type,)  for t in tagger.tag(tokens)]
                            if pos_tags:
                                pos_tags[0] = pos_tags[0][0:2] + ("B-" + pos_tags[0][2],)
                                if debug:
                                    print >> sys.stderr, pos_tags
                            annotations[" ".join([str(ti) for ti in type_indexes[1:]])] = pos_tags
                            #print >> ann_ext_file, " ".join([str(ti) for ti in type_indexes]) + "\t" + ann_text + "\t" + pos_tags
                    ann_file.close()
                    #ann_ext_file.close()

                    if debug:
Example #53
tokenizerPalavras = TreebankWordTokenizer()
arquivoClassificador = open('classificador.pickle', 'rb')
classificador = _pickle.load(arquivoClassificador)
arquivoClassificador.close()
arquivoClassificados = open('classificados.json')
classificados = ujson.load(arquivoClassificados)
arquivoClassificados.close()
sentimentos = {}
featuresClassificados = []
comeco = datetime.now()
for resposta in classificados:
	texto = resposta['corpo']
	frases = tokenizerFrases.tokenize(texto)
	feature = {}
	for frase in frases:
		palavras = tokenizerPalavras.tokenize(frase)
		palavras = [palavra for palavra in palavras if palavra not in stopWords]
		for palavra in palavras:
			feature[palavra] = True
	sentimentos[texto] = (resposta, classificador.classify(feature))
	featuresClassificados.append((feature, resposta['sentimento']))
tempo = datetime.now() - comeco
arquivoMedicoes = open('medicoes_analise_sequencial.txt', 'w')
arquivoMedicoes.write('Tempo de Execução = ' + str(tempo) + '\nPrecisão = {0:.2f}%'.format(accuracy(classificador, featuresClassificados) * 100))
arquivoMedicoes.close()
arquivoResultados = open('resultados_sem_stopwords.csv', 'w', newline='')
w = writer(arquivoResultados, delimiter=',')
linhas = [['Resposta', 'Pontos', 'Sentimento - Naive Bayes', 'Sentimento - AlchemyAPI']]
for texto in sentimentos.keys():
	tupla = sentimentos[texto]
	resposta = tupla[0]
Example #54
class WordTokenizer(object):
	def __init__(self):
		self._word_tokenizer = TreebankWordTokenizer()
	def tokenize(self, document):
		return self._word_tokenizer.tokenize(document)
Example #55
from nltk.tokenize import word_tokenize
print(
    word_tokenize(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))

from nltk.tokenize import WordPunctTokenizer
print(WordPunctTokenizer().tokenize(
    "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
))

from tensorflow.keras.preprocessing.text import text_to_word_sequence
print(
    text_to_word_sequence(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))

from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
print(tokenizer.tokenize(text))
Example #56
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

PHRASE = 'Cats pants and wolves'

tokenizer = TreebankWordTokenizer()
porter = PorterStemmer()
word_net = WordNetLemmatizer()

tokens = tokenizer.tokenize(PHRASE)
print(tokens)

print("Porter   : ", ' '.join(porter.stem(token) for token in tokens))
print("Word Net : ", ' '.join(word_net.lemmatize(token) for token in tokens))
Example #57
        #test_sents = []
        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_count += 1
                    if debug and file_count > debug_tests:
                        break
                    
                    file_text = os.path.join(dirname, f[:-4] + ".txt")
                    text_file = open(file_text, "r")
                    file_kpe = os.path.join(dir_output, f[:-4] + ".ann")
                    kpe_file = open(file_kpe, "w")

                    raw_text = unicode(text_file.read(), encoding="utf-8")
                    tokens = tokenizer.tokenize(raw_text)
                    tagged_text = [t + ("None",)  for t in tagger.tag(tokens)]
                    text_file.close()
                    #test_sents.append(tagged_text)
                    if extra_features:
                        X_test = kpc.sent2features_extra(tagged_text, qr)
                    else:
                        X_test = kpc.sent2features(tagged_text)

                    is_not_kp = "None"
                    tmp_label = is_not_kp
                    new_kp = []
                    kp_list = []
                    for kp in zip(crftagger.tag(X_test), [tt[0] for tt in tagged_text]):
                        if debug and False:
                            print >> sys.stderr, "    ---- ", kp
Example #58
#!/usr/bin/env python

import sys
import string
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

stop_words_english = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'yo', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn']

# Input : subreddit, created_utc, subreddit_id, link_id, name, id, gilded, author, score, body, controversiality, parent_id, compound, neg, neu, pos, sentiment_class
# Output : subreddit, sentiment_class, gilded, score, body, compound, neg, neu, pos

for line in sys.stdin:
    line = line.strip()
    row = line.split(",")
    subreddit = row[0]
    body = row[9]
    sentiment = row[16]

    tokenizer = TreebankWordTokenizer()
    word_tokens = tokenizer.tokenize(body.lower())
    
    filtered_words = [word for word in word_tokens if word not in stop_words_english]
    
    new_comment = ''
    for word in filtered_words:
        new_comment += ''.join([i if i.isalpha() or ord(i)==32 else '' for i in word])+' '
    
    print "%s,%s,%s,%s,%s,%s,%s,%s,%s" % (subreddit, sentiment, row[6], row[8], new_comment, row[12], row[13], row[14], row[15])
    
Example #59
def tokenizer(text):
    tbwt = TreebankWordTokenizer()
    text_out = tbwt.tokenize(text)
    return text_out