Example #1
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern=r"\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
                    
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = list(fdict.keys())[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern=r"\w+"))
            indice = 0
            
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
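The Alphabet codebooks above are project-specific, so for reference here is a self-contained sketch of the same idea (regexp tokenization, stopword filtering, binary indicator features); binary_bow and feature_index are hypothetical names and the NLTK stopwords corpus is assumed to be installed:

import nltk
import numpy as np
from nltk.corpus import stopwords

def binary_bow(documents, feature_index):
    # feature_index: hypothetical dict mapping feature words to vector positions
    stopset = set(stopwords.words('english'))
    vectors = []
    for doc in documents:
        vector = np.zeros(len(feature_index))
        for word in set(nltk.regexp_tokenize(doc, pattern=r"\w+")):
            if word not in stopset and word in feature_index:
                vector[feature_index[word]] = 1.0
        vectors.append(vector)
    return vectors

print(binary_bow(['the cat chased a dog', 'a fish'], {'cat': 0, 'dog': 1, 'fish': 2}))
# roughly: [array([1., 1., 0.]), array([0., 0., 1.])]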
Example #2
def get_freqs(text):

    stop_words = nltk.corpus.stopwords.words('english')
    frequencies = defaultdict(int)

    pattern = r'''(?x)    # set flag to allow verbose regexps
                    (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
                    | \w+(?:-\w+)*      # words with optional internal hyphens
                    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
                    | \.\.\.            # ellipsis
                    | [][.,;"'?():-_`]  # these are separate tokens
                     '''

    if type(text) == list:
        print 'number of links: '+ str(len(text))
        for t in text:
            content = t['content']
            tokens = nltk.regexp_tokenize(content, pattern)
            for word in tokens:
                if len(word) > 2 and word.lower() not in stop_words:
                    cap = word[0].upper() + word[1:]
                    frequencies[cap] += 1
    else:
        tokens = nltk.regexp_tokenize(text, pattern)
        for word in tokens:
            if len(word) > 2 and word not in stop_words:
                frequencies[word] += 1
    print "frequency size: "+str(len(frequencies))
    return frequencies
Example #3
def load(f=str):
    files = open(f)
    raw = files.read()
    pattern = r"""(?x)  \$?\d+(?:\.\d+)?%?    # currency
                      | \d+/\d+/\d+           # dates"""
    return nltk.regexp_tokenize(raw, pattern)
Example #4
def nltkTest():
    s = "russia licenza 8.1.5 U.S."
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)

    s = "Saldo vs. Fattura n. 2015/004"
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
Example #5
File: chapter3.py Project: hbdhj/python
def regularExpressionTokenizer():
    text = 'That U.S.A. poster-print costs $12.40...'
    pattern = r'''(?x)          # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | [][.,;"'?():-_`]    # these are separate tokens
    '''
    print(nltk.regexp_tokenize(text, pattern))
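Note that recent NLTK versions implement regexp_tokenize with re.findall, so capturing groups change what is returned; the non-capturing form above is the safe one. A minimal illustration of the difference:

import nltk

text = 'poster-print costs twelve dollars'
print(nltk.regexp_tokenize(text, r'\w+(-\w+)*'))    # ['-print', '', '', '']  -- the group, not the match
print(nltk.regexp_tokenize(text, r'\w+(?:-\w+)*'))  # ['poster-print', 'costs', 'twelve', 'dollars']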
def get_links(text):
    # checks only for  'http://...' and 'www...'
    text = text + " "
    pat = "http://.*?\s"
    links = nltk.regexp_tokenize(text, pat)
    text = " " + text + " "
    pat = "\swww\..*?\..*?\s"
    links.extend(nltk.regexp_tokenize(text, pat))
    links = map(lambda x: x[:-1], links)
    return links
Example #7
File: stem.py Project: rve/keyword
def poss_test(test_file,test_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)

    t = open(test_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    stopwords = sw
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a == 0:
            a += 1
            continue
        if a%1000 == 0:
            print a    
        a += 1
        #if a == 8:
        #    sys.exit(1)

        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"[a-z]\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)

        #light stem
        #title = set([stem(word) for word in title])
        #body = set(body)
        #body = set([stem(word) for word in body])

        #remove stopwords
        #body = filter(g,body)
        #title = filter(g,title)

        body = ' '.join(body)
        title = ' '.join(title)
        t.write('%s , %s \n'%(title,body))
Example #8
def poss_test(test_file,test_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)

    t = open(test_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw 
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:

        if a%10000 == 0:
            print(a)
        a += 1
        #if a == 8:
        #    sys.exit(1)

        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"(\.?[a-z][a-z0-9\+\.\#\-]+[a-z0-9\+\#])"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])


        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s"\n'%(row[0],title,body))
Example #9
    def query_episode(self, show_title, 
        ep_title, se_number, ep_number, runtime):
        """build video list prior to scoring
        """
        qres = {}

        # Query 1
        qlist = (show_title, ep_title)
        # Search YouTube
        tmp = self.search('%s %s' % qlist)
        for k, v in tmp.items():
            qres[k] = v
        # Query 2
        qlist = (show_title, ep_title, 
            se_number, ep_number)
        # Search YouTube
        tmp = self.search('%s %s  %s  %s' % qlist)
        for k, v in tmp.items():
            qres[k] = v
        # Query 3
        qlist = (show_title, 
            se_number, ep_number)
        # Search YouTube
        tmp = self.search('%s s%02de%02d' % qlist)
        for k, v in tmp.items():
            qres[k] = v

        # Show tokens
        sh_stem = [self._lancaster.stem(t) \
            for t in nltk.regexp_tokenize(
                show_title.encode('utf8'), r"\w+")]

        # Episode stem tokens if exist
        if ep_title:
            ep_stem = [self._lancaster.stem(t) \
                for t in nltk.regexp_tokenize(
                    ep_title.encode('utf8'), r"\w+")]
        else:
            ep_stem = None

        res = {'Output': qres, 
               'Input': {},}
        res['Input']['show_title'] = show_title
        res['Input']['ep_title'] = ep_title
        res['Input']['sh_stem'] = sh_stem
        res['Input']['ep_stem'] = ep_stem
        res['Input']['se_number'] = se_number
        res['Input']['ep_number'] = ep_number
        res['Input']['runtime'] = runtime

        return res
Example #10
File: pre_nltk.py Project: rve/keyword
def poss_train(train_file,train_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)

    t = open(train_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw  # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a%100000 == 0:
            print a    
        a += 1
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        #st = LancasterStemmer()
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])

        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n'%(row[0], title,body,row[3]))
Example #11
def normalized(text, lowercase=True, fix=True, tuples=False):
    """Tokenize, remove capitalization and exclude punctuation
    """
    if fix:
        text = fix_text(unicode(text))
    pattern = r"""(?x)    # verbose regexps
        \w+(-\w+)*        # words with optional internal hyphens
    """
    result = [w for w in nltk.regexp_tokenize(text, pattern)]
    if lowercase:
        result = [w.lower() for w in nltk.regexp_tokenize(text, pattern)]
    if tuples:
        result = tuple(result)
    return result
Example #12
def handleSubject1(outputFile):
    """
    :return: dict
    """
    index = 0
    termdict = dict()
    subjectList = list()

    f = open("data/topic/subject1_w_date.txt")
    for item in f:
        array = item.strip().split("DELIMER")
        count = array[0]
        subject = array[3]

        for (regex, repl) in helper.regexList.items():
            subject = regex.sub(repl, subject)
        for s in helper.specialSet:
            subject = subject.replace(s, "")

        termList = nltk.regexp_tokenize(subject, helper.nltkPattern)  # use the nltk package to tokenize the subject
        s = ""
        for term in termList:
            if term.lower() not in helper.excludeSet:
                s += term + " "  # reconstruct the subject
                if term not in termdict:
                    termdict[term.strip()] = index
                    index += 1

        if s != "":
            regex = re.compile("\s+")
            s = regex.sub(" ", s)
            subjectList.append("{}DELIMER{}DELIMER{}DELIMER{}".format(count, array[1], array[2], s.strip()))

    fileHelper.writeIterableToFile(outputFile, subjectList)
    return termdict
def tokenprocess(Strtext): 
   
    f = open(Strtext)
    raw = f.read().strip()
    stop_words = stopwords.words('english')
    
    pattern = r'''(?x)(?:[A-Z]\.)+|\w+(?:-\w+)*|\$?\d+(?:\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''
    text1 = map(lambda word:word.lower(),nltk.regexp_tokenize(raw,pattern))
    text1_filter = [word for word in text1 if len(word) > 1 and word.find("'") == -1 and word not in stop_words]
    
  
    return text1_filter




#eassylist = corpus_data()
#print eassylist[-1]

#stop_words = stopwords.words('english')
#print stop_words


    
    
    
    
Example #14
 def tag(text):
     tokens = nltk.regexp_tokenize(text, SENTENCE_REGEX)
     pos_tokens = nltk.tag.pos_tag(tokens)
     chunker = nltk.RegexpParser(GRAMMAR)
     tree = chunker.parse(pos_tokens)
     terms = Tagger.get_terms(tree)
     return Tagger.word_list(terms)
def russian_get_text(inp, output):
    # parse tweets from .csv file by Julia Rubtsova
    # from 'Метод построения и анализа корпуса коротких текстов для задачи классификации отзывов'
    data = read_data(inp)

    res = []
    pattern = '''"(.*?)";'''
    for line in data:
        tokens = nltk.regexp_tokenize(line, pattern)
        if len(tokens) < 4:
            continue

        text = tokens[3][1:-2]
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        sname = tokens[2][1:-2]
        if text == '':
            continue

        row = [text, 'not-given', sname, 'not-given', ','.join(hashtags), ','.join(mentions), ','.join(links)]
        row = '\t'.join(row)

        res.append(row)

    write_data(output, res)
Example #16
def ShowCollocations():
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	pattern = r'''(?x)(?:[A-Z]\.)+|\w+(?:[-']\w+)*|\$?\d+(?:\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
Example #17
    def main(self, text):
        """Breaks a single string into a tree using the grammar and returns
        the specified words as a string."""

        if text is None:
            return None

        try:
            text = text.encode("ascii", "ignore")
        except:
            text = text.decode("utf-8", "ignore").encode("ascii", "ignore")

        chunker = nltk.RegexpParser(grammar)

        toks = nltk.regexp_tokenize(text, sentence_re)
        postoks = nltk.tag.pos_tag(toks)

        #print postoks
        tree = chunker.parse(postoks)

        terms = self.get_terms(tree)

        words = self.get_words(terms)

        return words
Example #18
def longitud_promedio_palabras_moens(lista):
    regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+"
    total_palabras_en_oraciones = 0
    num_oraciones = 0
    tokens = 0
    promedio_longitud_palabras_oraciones = []
    for oracion in lista:
        total_palabras_oracion = 0
        num_palabras_oracion = 0
        tokens = nltk.regexp_tokenize(oracion, regexp)
        total_palabras_en_oraciones += len(tokens)
        for palabra in tokens:
            total_palabras_oracion += len(palabra)
            num_palabras_oracion += 1
            #print palabra
            #print len(palabra)
        if total_palabras_oracion > 0:
            promedio_longitud_palabras_oraciones.append(total_palabras_oracion/num_palabras_oracion)
        else:
            print oracion
        #print len(tokens)
        #total += len(oracion.split())
        num_oraciones += 1
    #promedio = total_palabras_en_oraciones / num_oraciones
    #print promedio_longitud_palabras_oraciones
    suma_promedios=0
    num_promedios = 0
    for promedios in promedio_longitud_palabras_oraciones:
        suma_promedios += promedios
        num_promedios += 1
    promedio = suma_promedios/num_promedios
        
    #promedio = sum(promedio_longitud_palabras_oraciones)/float(len(promedio_longitud_palabras_oraciones))    
    return promedio
def classif(text, mass, num_all_docs, num_words_unic):
    stm = Stemmer('russian')
    text = stm.stemWords(regexp_tokenize((text.decode('UTF-8')).lower(), r"(?x) \w+ | \w+(-\w+)*"))
    num_povt_words = 0
    summa = 0
    while_iter = 0
    while while_iter < len(mass):
        summand_1 = log((mass[while_iter].num_docs + 0.0) / (num_all_docs + 0.0) + 0.0, 1.1)
        for i in text:
            for i1 in mass[while_iter].lst_allword:
                if i == i1:
                    num_povt_words = num_povt_words + 1
            summand_2 = log(((num_povt_words + 1) + 0.0) / ((num_words_unic + mass[while_iter].num_words) + 0.0), 1.1)
            num_povt_words = 0
            summa = summa + summand_2
        mass[while_iter].c = summand_1 + summa
        summa = 0
        while_iter = while_iter + 1

    max_c = -100000
    while_iter = 0
    number_max = 0

    while while_iter < len(mass):
        print mass[while_iter].c
        if mass[while_iter].c > max_c:
            max_c = mass[while_iter].c
            number_max = while_iter
        while_iter = while_iter + 1
    print mass[number_max].name_categories
Example #20
def numero_puntuacion_moens(texto):
    regexp = "[/,$?:;!()&%#=+{}*~.]+"
    tokens = nltk.regexp_tokenize(texto, regexp)
    total = len(tokens)
    print len(tokens)    
    print tokens    
    return total
Example #21
def word_couple_con_puntuacion_pares_minusculas(lista):
    word_couples = []
    
    
    regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+-*[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[.]+|[/,$?:;!()&%#=+{}*~.]+|[0-9]+"
    
    for oracion in lista:
        
        #oracion = str(oracion)
        #oracion = oracion.to_lower
        #print oracion
        
        
        tokens = nltk.regexp_tokenize(oracion.lower(), regexp)
        #print len(tokens)
        
#         tokens_lower = []
#         for i in range(len(tokens)):
#             palabra = str(tokens[i])
#             tokens_lower.append(palabra.to_lower() )          
            
        
        pairs = list(itertools.permutations(tokens, 2))
        for pair in pairs:
            word_couples.append(pair[0]+"~"+pair[1])
        
    return word_couples
Example #22
File: 9.py Project: Anastasia1302/nltk
def tokenize_punctuation(t):
	"""Tokenizes the punctuation in a text 't'."""
	pattern = r'''(?x)			# set to be verbose
	\W 						# matches each non-word character (including whitespace).
	'''
	matches = nltk.regexp_tokenize(t, pattern)
	return matches
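A quick check of what this returns; note that the space between the words is itself a \W match:

print(tokenize_punctuation("Hello, world!"))
# [',', ' ', '!']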
Example #23
 def extract(self, text):
     ''' Extract and freudify noun phrases from text, return all successfully
     freudified noun phrases. '''
     
     toks = nltk.regexp_tokenize(text, self.sentence_re)
     postoks = nltk.tag.pos_tag(toks)
     tree = self.chunker.parse(postoks)
     terms = self._get_terms(tree)
     
     phrases = set()
     
     # Loop through all the noun phrases and try to freudify them.
     for term in terms:
         if (len(term)) < 2: continue
         changed = False
         context = ""
         phrase = []
         for part in term:
             word, tag = part
             word = word.encode('ascii', 'replace')
             phrase.append(word.lower())
             rpl = self.replace_word(tag[:2], word)
             if len(rpl[2]) > 0:
                 context = rpl[2]
                 phrase[-1] = rpl[0]
                 changed = True
         if changed:
             phrase = " ".join(phrase).strip()
             phrase.encode('ascii', 'replace')
             phrase = str(phrase)
             if phrase not in self.own_phrases[context]:
                 phrases.add((str(phrase), context))    
       
     phrases = list(phrases)      
     return phrases
Example #24
def generate_vocab(papers):
    """Returns the vocabulary used in the papers given in parameters, after cleaning and stopwords removal.

    Args:
        papers (list of tuples): the raw list of papers from which generates the vocabulary (each element is a tuple of 3 strings: id, title and abstract)

    Returns:
        list of strings: the list of tokens forming the vocabulary
    """
    sc = StringCleaner()

    # Generate author's vocabulary
    corpus = " ".join(p[1] + " " + p[2] for p in papers)
    # Cleaning
    corpus = sc.clean_string(corpus)
    # Tokenization
    pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
    #         we keep tokens that are words (with optional internal hyphens), acronyms and percentages
    tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english"))
    num_re = re.compile("^\d+$")
    tokens = set([t for t in tokens if not num_re.match(t)]) # we remove only-numeric tokens
    # Stemming
    porter = nltk.stem.PorterStemmer()

    return [porter.stem(t) for t in tokens]
def AO_lTokenize(AO_sText):


    '''
        This breaks a text into individual words
        Adapted From Natural Language Processing with Python
    '''
    regex = r'''(?xi)
    (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills
  | ([A-Z]\.)+                                              # Abbreviations (U.S.A., etc.)
  | ([A-Z]+\&[A-Z]+)                                        # Internal ampersands (AT&T, etc.)
  | (Mr\.|Dr\.|Mrs\.|Ms\.)                                  # Mr., Mrs., etc.
  | \d*\.\d+                                                # Numbers with decimal points.
  | \d\d?:\d\d                                              # Times.
  | \$?[,\.0-9]+\d                                          # Numbers with thousands separators, (incl currency).
  | (((a|A)|(p|P))\.(m|M)\.)                                # a.m., p.m., A.M., P.M.
  | \w+((-|')\w+)*                                          # Words with optional internal hyphens.
  | \$?\d+(\.\d+)?%?                                        # Currency and percentages.
  | (?<=\b)\.\.\.(?=\b)                                     # Ellipses surrounded by word borders
  | [][.,;"'?():-_`]
    '''
    # Strip punctuation from this one; solr doesn't know about any of it
    tokens = regexp_tokenize(AO_sText, regex)
    # tokens = [re.sub(r'[.,?!]', '', token) for token in tokens]  # instead of this we just test word length
    return tokens
Example #26
def handle_doc(word_set,rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path+docs)
        print "start to handle the -->  "+docs
        for file_d in files:
            d_path = rs_path+docs+'/'+file_d
            #get the single file path
            with open(d_path,'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line,pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
            text_file.close()
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1,:],doc_cat[:-1]
def word_segment(data, mark_stop, english_stop):
    """
    分词并去除停用词
    :param data:
    :param stopwords_list:
    """

    """
    segment_text = nltk.word_tokenize(data.replace('.', ' '))
    segment_text = [word.lower() for word in segment_text if word.lower() not in (english_stop + mark_stop)]
    segment = nltk.pos_tag(segment_text)  #词性标注
    """

    pattern = r"""(?x)([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;'"?():-_`]"""
    segment_text = nltk.regexp_tokenize(data, pattern)
    # 可选择取词干
    # porter = nltk.PorterStemmer()
    segment_text = [t.lower() for t in segment_text if t.lower() not in (english_stop + mark_stop)]
    segment = nltk.pos_tag(segment_text)  # 词性标注

    segment_list = []
    for item in segment:
        segment_list.append(item[0] + "," + item[1])

    return segment_list
def tokenize_tag_text(description):
    """Removes some punctuation, tags each word by part-of-speech, and generates keyword and 
    keyword prhases  based on noun phrases patterns using regexp."""

    sentence_re = r'''(?x)
    ([A-Z])(\.[A-Z])+\.?  # set flag to allow verbose regexps
    | \w+(-\w+)*          # words with optional internal hyphens
    | \$?\d+(\.\d+)?%?    # currency and percentages
    | \.\.\.              # ellipsis
    | [][.,;"?():-_`]     # separate tokens
    '''

    grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}             # Nouns and Adjectives, terminated with Nouns
        {<NNP|NNPS>+<IN>?<NNP|NNPS>+}  # A sequence of proper nouns connected with zero or more prepositions
        {<DT|PP\$>?<JJ>*<NN|NNS>}      # Determiners (e.g. 'the', 'a') or possessive, followed by one or more adjective 
        {<NN>+}                        # A sequence of one or more nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  
    """

    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(description, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = chunker.parse(postoks)
    return tree 
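Illustrative usage, assuming NLTK and its POS-tagger model are available; the returned nltk.Tree contains NP subtrees that hold the candidate keyword phrases:

tree = tokenize_tag_text("The quick brown fox jumps over the lazy dog")
print(tree)  # an nltk.Tree with NP chunks such as the phrases around 'fox' and 'dog'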
Example #29
 def compute_df(self, document_list):
     '''Compute document frequency based on input document list'''  
     df_cache = dict()
     df_output = dict()
     
     d_index = 0
     for document in document_list:
         d_index += 1
         # tokenize each document
         reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE)
         for item in reg_toks:
             # change each word to lower case and lemmatize
             item = normalise(item)
             if item not in df_cache:
                 df_cache[item] = set([d_index])
             else:
                 df_cache[item].add(d_index)
     
     for item in df_cache:
         if acceptable_word(item):
             df_output[item] = len(df_cache[item])
     
     df_output['total_document'] = len(document_list)
     
     return df_output
Example #30
    def parse(self, response):
        fd = nltk.FreqDist()
        punct = set(string.punctuation)
        self.i += 1
        titles = Selector(response=response).xpath('//title/text()').extract()
        filename = response.url.split("/")[-2]
        filedir = dirs[self.i - 1] + '/' + filename
        filedir = 'Top' + '/' + filedir.split('/')[1] + '-' + filedir.split('/')[2]
        print dirs[self.i - 1]
        print filedir
        temp = stripAllTags(response.body)
        s = MLStripper()
        s.feed(temp)
        pure_body = s.get_data()
        pure_body = pure_body.lower()
        pure_body = unicodedata.normalize('NFKD', pure_body).encode('ASCII', 'ignore')
        for word in nltk.regexp_tokenize(pure_body, pattern=r'\.|(\s+)', gaps=True):
            if word not in punct and word not in common_words:
                fd.inc(word)
        freq_tuples = fd.items()
        if not os.path.exists(filedir):
            os.makedirs(filedir)
        filedir = filedir + '/' + filename
        with open(filedir, 'wb') as f:
            f.write('@attribute ' + filedir.split('/')[1] + ' {0,1}\n\n' + '@data\n')
            f.write('\n')
            for title in titles:
                f.write(title.encode('utf-8').strip())
            f.write('\n')
            for item in freq_tuples:
                i = 0
                while i < item[1]:
                    i += 1
                    f.write(item[0] + '\n')
Example #31
def URLDECODE(XSS):

    XSS = XSS.lower()
    XSS = unquote(unquote(XSS))
    XSS, num = re.subn(r'\d+', "0", XSS)

    XSS, num = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?]+', "http://u",
                       XSS)

    r = '''
        (?x)[\w\.]+?\(
        |\)
        |"\w+?"
        |'\w+?'
        |http://\w
        |</\w+>
        |<\w+>
        |<\w+
        |\w+=
        |>
        |[\w\.]+
    '''
    return nltk.regexp_tokenize(XSS, r)
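An illustrative call on a hypothetical payload; digits collapse to "0" and the URL collapses to "http://u" before tokenization. (Note that the (?x) flag above sits after leading whitespace, which Python 3.11+ rejects; on earlier versions the call behaves as shown.)

print(URLDECODE('<IMG SRC="http://evil.example/x.js" onerror=alert(123)>'))
# roughly: ['<img', 'src=', 'http://u', 'onerror=', 'alert(', '0', ')', '>']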
Example #32
def get_docs(file_name):
	in_put = open(file_name, 'rU')
	raw = in_put.readlines()

	# keep the line numbers of papers that have an abstract
	paper_id = [raw.index(w) for w in raw if w != 'null\n']
	raw = [w.lower() for w in raw if w != 'null\n']

	docs = [nltk.regexp_tokenize(w, pattern) for w in raw]

	# keep only English words (hyphenated words get dropped) and remove stopwords
	for i in xrange(len(docs)):
		docs[i] = [w for w in docs[i] if w.isalpha() and w not in stop_word]

	wnl = nltk.WordNetLemmatizer()

	# lemmatization
	for i in xrange(len(docs)):
		docs[i] = [wnl.lemmatize(t) for t in docs[i]]

	in_put.close()

	return docs
Example #33
def computeSentiment(text):
    # Tokenize and remove stop words
    tokens = []
    for t in nltk.regexp_tokenize(text.lower(), '[a-z]+'):
        if t not in sr:
            tokens.append(t)
    tokens[:10]

    # Count the number of positive and negative words.
    pos_count = 0
    neg_count = 0
    for t in tokens:
        if t in pos_words:
            pos_count += 1
        elif t in neg_words:
            neg_count += 1

    # Compute sentiment
    if (pos_count + neg_count) > 0:
        sentiment = float(pos_count - neg_count) / float(pos_count + neg_count)
    else:
        sentiment = 0
    return sentiment
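computeSentiment reads the module-level names sr (stopwords), pos_words and neg_words; a self-contained way to wire them up for a quick test, where the two word sets are tiny hypothetical stand-ins for a real opinion lexicon:

from nltk.corpus import stopwords

sr = set(stopwords.words('english'))
pos_words = {'good', 'great', 'love'}      # hypothetical positive lexicon
neg_words = {'bad', 'terrible', 'hate'}    # hypothetical negative lexicon

print(computeSentiment("I love this phone but the battery is bad"))
# (1 - 1) / (1 + 1) = 0.0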
Example #34
def GeneSeg(payload):
    # generalize digits to "0"
    payload = payload.lower()
    #payload=unquote(unquote(payload))   # already URL-decoded upstream, so no need to decode again
    payload, num = re.subn(r'\d+', "0", payload)
    # replace URLs with "http://u"
    payload, num = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?]+',
                           "http://u", payload)
    # tokenize
    r = '''
        (?x)[\w\.]+?\(
        |\)
        |"\w+?"
        |'\w+?'   
        |http://\w
        |</\w+>
        |<\w+>
        |<\w+
        |\w+=
        |>
        |[\w\.]+
    '''
    return nltk.regexp_tokenize(payload, r)
Example #35
    def text_parse(cls, x):
        try:
            sentence = x.strip().lower()
        except:
            sentence = x

        sentence = re.sub(cls.hndl_regex, cls.hndl_repl, sentence)  # replace @-mentions
        sentence = re.sub(cls.hash_regex, cls.hash_repl, sentence)  # replace #hashtags
        sentence = re.sub(cls.url_regex, cls.url_repl, sentence)  # replace URLs
        sentence = re.sub(cls.rpt_regex, cls.rpt_repl, sentence)  # collapse repeats, e.g. yoooooooo -> yoo

        emoticons_regex = [(repl, re.compile(cls.regex_union(cls.escape_paren(regx)))) for (repl, regx) in
                           cls.emoticons]  # replace emoticons
        for (repl, regx) in emoticons_regex:
            sentence = re.sub(regx, ' ' + repl + ' ', sentence)

        pattern = r""" (?x)(?:[a-z]\.)+ 
                        | \d+(?:\.\d+)?%?\w+
                        | \w+(?:[-']\w+)*
                        | (?:[-.!?]{2,})
                        | [][.,;"'?():$-_*`]"""
        word_list = nltk.regexp_tokenize(sentence, pattern)
        return word_list
def talk_to_bot():
    vocab = chatbot.read_vocab()
    vectors = chatbot.read_vectors()

    # Problem
    # Find the length of the longest token in the question and answer data.
    dialog_questions = vectors[::2]
    dialog_answers = vectors[1::2]

    max_len_q = max([len(q) for q in dialog_questions])
    max_len_a = max([len(a) for a in dialog_answers]) + 1
    print(max_len_q, max_len_a)  # 9 10

    # ------------------------------------------------ #

    # model = tf.keras.  (left unfinished in the source snippet)
    onehot = np.eye(len(vocab), dtype=np.float32)

    while True:
        sys.stdout.write('왕자: ')
        sys.stdout.flush()

        line = sys.stdin.readline()
        line = line.strip()

        if '끝' == line:
            break

        # Problem
        # Split the input sentence into tokens.
        # tokens = line.split()         # when there may be several spaces in a row(?)
        tokens = nltk.regexp_tokenize(line, r'\w+')
        # print(tokens)                 # e.g. ['이리', '와서', '나하고', '놀자']

        # Problem
        # Convert the tokens into a question (map string tokens to their vocabulary indices).
        question = [vocab.index(t) if t in vocab else chatbot._UNK_ for t in tokens]
def analyze2(text2):
    # takes a list of comment strings and tokenizes and finds pairs of positive and negative words with specific phone features
    new=[]
    tokens=[]
    count=0
    negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
    with open("positive-words.txt",'r') as f:
        positive_words=[line.strip() for line in f]
    with open("negative-words.txt",'r') as f:
        negative_words=[line.strip() for line in f]
    positive_tokens=[]
    negative_tokens=[] #N is 1 this time, negating word right before pos or neg word
    reviewpos=[]
    reviewneg=[]
    features=["headphones", "battery", "sound", "charge", "screensize", "size", "space", "storage", "camera", "speed", "display", "sensor", "casing", "price"]
    for text in text2:
        #text=text.strip(string.punctuation)
        #text=text.strip(" ")
        #tokens=nltk.word_tokenize(text)
        #tokens = re.split(r"\W+", text)
        pattern=r'\w[\w\'-]*\w'      
        tokens=nltk.regexp_tokenize(text, pattern)
        tokens=[tokens.lower() for tokens in tokens]
        #tokens=[token.strip(string.punctuation) for token in tokens]
        #tokens=[token.strip() for token in tokens if token.strip()!='']
        new.append(tokens) #change += to not have seperated list per comment
        count+=1
    for x in new:
        for i in range(0, len(x)):
            previ=""
            if i>0:
                previ=x[i-1]
            if previ in positive_words and x[i] in features:
                    reviewpos.append((previ,x[i]))
            if previ in negative_words and x[i] in features:
                    reviewneg.append((previ,x[i]))
    return reviewpos, reviewneg
def tknse(s):
    """
    Tokenises a sentence string in a suitable to way to analyse
    both bible text and twitter data 
    (e.g. catching and filtering out mentions, URLs, ...)
    """

    import string
    from nltk import regexp_tokenize

    # define pattern for regexp
    pattern = [
        r'<[^>]+>',  # HTML tags (drop)
        r'(?:@[\w_]+)',  # @-mentions (catch and filter drop)
        #r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags (keep as words)
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs (catch and filter)
        r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers (drop)
        r'(?:[A-Z]\.)+',  # abbreviations, e.g. U.S.A.
        #r'\$?\d+(?:\.\d+)?%?', # currency and percentages, e.g. $12.40, 82% (leave as numbers)
        r'(?:[\w_]+)',  # other words
        r'(?:\S)'  # anything else
    ]
    pattern = r'(' + '|'.join(
        pattern) + ')'  # collapse into single regex string
    tok = regexp_tokenize(s, pattern)  # tokenise

    # filter out unwanted tokens
    tok = list(
        filter(
            lambda w: (w[0] not in ['@']) and  # @-mentions
            (w[0:4].lower() != 'http') and  # URLs
            (w.replace('.', '', 1).isdigit() == False) and  # numbers
            (w not in string.punctuation),
            tok))
    # to lower case
    tok = [w.lower() for w in tok]  # lower case only
    return (tok)
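Since tknse does its imports internally, a quick sanity check is easy; mentions, URLs, numbers and punctuation are filtered out and the remaining tokens are lower-cased:

print(tknse("Check http://example.com with @bob: 3 strikes... #done"))
# ['check', 'with', 'strikes', 'done']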
Example #40
def text_parse(input_text, language='en'):

    sentence = input_text.strip().lower()
    sentence = re.sub(r'@\s*[\w]+ | ?#[\w]+ | ?&[\w]+; | ?[^\x00-\xFF]+', '',
                      sentence)
    special_tag = {
        '.', ',', '#', '!', '(', ')', '*', '`', ':', '"', '‘', '’', '“', '”',
        '@', ':', '^', '/', ']', '[', ';', '=', '_'
    }
    pattern = r""" (?x)(?:[a-z]\.)+ 
                  | \d+(?:\.\d+)?%?\w+
                  | \w+(?:[-']\w+)*"""
    word_list = regexp_tokenize(sentence, pattern)

    filter_word = []
    if language == 'en':
        filter_word = [
            w for w in word_list
            if w not in stopwords.words('english') and w not in special_tag
        ]  # remove stopwords and special punctuation
    word_tag = pos_tag(
        filter_word, tagset=None,
        lang=language)  # POS tagging; returns a list of tagged tokens, e.g. [('Codeine', 'NNP'), ('15mg', 'CD')]

    res_word_list = []
    lemmatizer = WordNetLemmatizer()  # lemmatization
    tag_list = {
        'TO', 'RB', 'RBR', 'RBRS', 'UH', 'WDT', 'WP', 'WP$', 'WRB', 'SYM',
        'RP', 'PRP', 'PRP$', 'CD', 'POS', ':'
    }
    for i in range(0, len(word_tag)):  # drop adverbs, prepositions, particles, wh-words, pronouns, personal and possessive pronouns, etc.
        if word_tag[i][1] in tag_list:
            continue
        else:
            word = lemmatizer.lemmatize(word_tag[i][0])
            res_word_list.append(word)
    return res_word_list
Example #41
def tokenExtractor(file):
    doc = xml.dom.minidom.parse(file)

    movieText = ""

    for item in doc.getElementsByTagName("s"):
        for child in item.childNodes:
            if child.nodeName == "#text" and len(re.findall("\w", child.nodeValue)) > 1:
                movieText += child.nodeValue

    movieText = re.sub("\n\s+", " ", movieText)
    movieText = re.sub("\n", "", movieText)

    pattern = r'''(?x)(?:[A-Z]\.)+ | \w+(?:-\w+)*'''
    tokens = nltk.regexp_tokenize(movieText, pattern)

    # Keep only tokens that contain at least one word character
    tokens = [w for w in tokens if re.search(r'\w', w)]

    # Remove stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    tokens = [w for w in tokens if w.lower() not in stopwords]
    stopwords = nltk.corpus.stopwords.words('spanish')
    tokens = [w for w in tokens if w.lower() not in stopwords]
    stopwords = nltk.corpus.stopwords.words('french')
    tokens = [w for w in tokens if w.lower() not in stopwords]
    stopwords = nltk.corpus.stopwords.words('italian')
    tokens = [w for w in tokens if w.lower() not in stopwords]

    # Remove numbers
    tokens = [w for w in tokens if not re.search(r'\d', w)]

    # Lower case
    tokens = [t.lower() for t in tokens]
    tokens = [item for item in tokens if item.isalpha()]

    return tokens
Example #42
def get_pos_tags(text):
    """Used when tokenizing words"""
    text = tostring(text)
    regex_patterns = r"""(?x)      # set flag to allow verbose regexps
          (?:[A-Z]\.)+  # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*            # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]      # these are separate tokens
    """
    # POS tagging
    # postoks = nltk.pos_tag(text.split())
    toks = nltk.regexp_tokenize(text, regex_patterns)
    assert isinstance(toks, list), "toks is not a list of str, cannot tokenize."
    postoks = nltk.tag.pos_tag(toks)
    # fix a weird pos-tagging error in NLTK
    prior_pos = ''
    for i in range(0, len(postoks)):
        if prior_pos == 'TO' and 'VB' not in postoks[i][1]:
            old = postoks.pop(i)
            postoks.insert(i, (old[0], 'VB'))
        prior_pos = postoks[i][1]
    # print('getPOStags_returns:', postoks)
    return postoks
Example #43
def keyphrase_sentence(sentence):
    sentence_re = r'''(?x)          
      (?:[A-Z]\.)+             
    | \w+(?:-\w+)*        
    | \$?\d+(?:\.\d+)?%?
    | \.\.\.              
    | [][.,;"'?():_`-]    
    '''

    toks = nltk.regexp_tokenize(sentence.lower(),
                                sentence_re)  # sentence tokenisation

    postoks = nltk.tag.pos_tag(toks)

    for i in range(len(postoks)):
        if postoks[i][1][0] == 'N' or postoks[i][1] == 'RB' or postoks[i][
                1] == 'DT':  # check if 'N'/'RB'/'DT' is appearing in pos list.
            token_ls = toks[i:len(
                toks)]  ## span of keyphrase ( starting point - 'N'/'RB'/'DT',
            ## ending point - ending of that sentence )
            token_ls = [i for i in token_ls if i not in stop_word_ls
                        ]  # remove stopwords from phrases.
            if len(token_ls) >= 3:
                return " ".join(token_ls)
Example #44
def test_handle(word_list,tr_path):
    docdir_list = os.listdir(tr_path)
    test_m = []
    test_cat = []
    for dd in docdir_list:
        file_list = os.listdir(tr_path+dd)
        print "handling the---> "+dd+" <---directory.."
        for fpath in file_list:
            d_path = tr_path + dd + '/' + fpath
            with open(d_path,"rb") as text_file:
                str_tmp = ''
                test_cat.append(cat_dic[dd])
                fl = text_file.readlines()
                test_por = nltk.PorterStemmer()
                for doc_line in fl:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(doc_line,pattern)
                    for t in tokens:
                        if t.lower() in word_list:
                            str_tmp += t.lower()
                            str_tmp += ' '
                test_m.append(str_tmp)
            text_file.close()
    # finally append the whole word set as one extra document?! try once!
    str_tmp = ''
    for sw in word_list:
        str_tmp += sw
        str_tmp += ' '
    test_m.append(str_tmp)
    test_cat.append(10)
    vectorizer = CountVectorizer()
    doc_m = vectorizer.fit_transform(test_m)
    tfidf = TfidfTransformer()
    test_matrix = tfidf.fit_transform(doc_m)
    #test_matrix = log_sparsematrix(test_matrix)
    return test_matrix,test_cat
def get_clean_text_pattern(recomposed_note):
    """Function that filters through the notes, retrieves those that match
     the specified pattern and removes stopwords."""
    pattern = "([a-zA-Z0-9\\\]+(?:'[a-z]+)?)"
    recomposed_note_raw = nltk.regexp_tokenize(recomposed_note, pattern)
    # Create a list of stopwords and remove them from our corpus
    stopwords_list = stopwords.words('english')
    stopwords_list += list(string.punctuation)
    # additional slang and informal versions of the original words had to be added to the corpus.
    stopwords_list += ([
        "im", "ur", "u", "'s", "n", "z", "n't", "brewskies", "mcd’s", "Ty$",
        "Diploooooo", "thx", "Clothessss", "K2", "B", "Comida", "yo", "jobby",
        "F", "jus", "bc", "queso", "fil", "Lol", "EZ", "RF", "기프트카드", "감사합니다",
        "Bts", "youuuu", "X’s", "bday", "WF", "Fooooood", "Yeeeeehaw", "temp",
        "af", "Chipoodle", "Hhuhhyhy", "Yummmmers", "MGE", "O", "Coook",
        "wahoooo", "Cuz", "y", "Cutz", "Lax", "LisBnB", "vamanos", "vroom",
        "Para", "el", "8==", "bitchhh", "¯\\_(ツ)_/¯", "Ily", "CURRYYYYYYY",
        "Depósito", "Yup", "Shhhhh"
    ])

    recomposed_note_stopped = ([
        w.lower() for w in recomposed_note_raw if w not in stopwords_list
    ])
    return recomposed_note_stopped
Example #46
def processPreDiffCode(code):
    code = re.sub(r'(\"[\s\S]*?\")', '', code, 0, re.I)
    code = re.sub(r'(@@[\s\S]*?\n)', '', code, 0, re.I)
    code = re.sub(r'(\+[\s\S]*?\n)', '', code, 0, re.I)
    result = []
    mis = methodInvocationCase.findall(code)
    for mi in mis:
        miWords = mi.split('.')  # split the dotted invocation chain on '.'
        for miWord in miWords:
            toDeal = []
            if camelCase1.match(miWord) or camelCase2.match(miWord):
                toDeal = splitCode(miWord)
            elif upperExtCase.match(miWord):
                toDeal = splitFinalExt(miWord)
            elif upperCase.match(miWord):
                toDeal.append(miWord)
            for deal in toDeal:
                if not isDelete(deal.lower()):
                    result.append(stemmer.stem(deal))

    code = re.sub(r'([A-Za-z0-9_]+\.[A-Za-z0-9_]+)', '', code, 0, re.I)
    sentences = tokenizer.tokenize(code)
    for sentence in sentences:
        words = nltk.regexp_tokenize(sentence, pattern)
        for word in words:
            toDeal = []
            if camelCase1.match(word) or camelCase2.match(word):
                toDeal = splitCode(word)
            elif upperExtCase.match(word):
                toDeal = splitFinalExt(word)
            elif upperCase.match(word):
                toDeal.append(word)
            for deal in toDeal:
                if not isDelete(deal.lower()):
                    result.append(stemmer.stem(deal))
    return result
Example #47
def extract_clean_text(json_file):
    wv = []
    cnt = 0
    stoplist = load_stoplist()
    wordnet_lemmatizer = WordNetLemmatizer()
    with open(json_file, 'r') as json_file:
        user_tweets = json.load(json_file)
        for user in user_tweets:
            text = ''
            for tweet in user_tweets[user]:
                text += common.cleanhtml(
                    common.remove_hashtag_sign(
                        common.remove_username(
                            common.remove_url(ftfy.fix_text(tweet))))) + ' '
            # clean_texts = [wordnet_lemmatizer.lemmatize(word.lower()) for word in nltk.regexp_tokenize(text, pattern)]
            clean_texts = [
                wordnet_lemmatizer.lemmatize(word.lower())
                for word in nltk.regexp_tokenize(text, pattern)
                if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist
            ]
            wv.append(clean_texts)
            cnt += 1
    logger.info('total tweets: %d;' % cnt)
    return wv
Example #48
def tf_text(text_title_summary_reviews, docID):
    """ Returns a list of filtered terms: (term, (docID, tf/sqrt(len(keywords)))) """

    pattern = r'''(?x)              # set flag to allow verbose regexps
            aujourd'hui             # exception 1
            | prud'hom\w+           # exception 2
            | \w'                   # contractions d', l', j', t', s'
            | \d+(?:,\d+)?%?€?      # currency and percentages, e.g. 12,40€, 82%        
            | (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
            | \w+(?:-\w+)*          # words with optional internal hyphens
            #| [][.,;"'?():_`-]     # these are separate tokens; includes ], [
        '''

    words = nltk.regexp_tokenize(text_title_summary_reviews.lower(), pattern)

    keywords = []
    fdist = FreqDist()

    for elt in words:
        if elt[0] in LOADED_LEMMA:
            try:  # take the first possible lemma even though it may be wrong (e.g. abstrait -> abstraire (verb))
                lemma = [x[0] for x in LOADED_LEMMA[elt[0]]
                         if x[0][0] == elt][0][1]
            except:
                with open("backend/language/lemma/missing.txt", "a") as f:
                    f.write(unidecode.unidecode(elt) + "\n")
                lemma = elt

            if not lemma in stopwords:
                keywords.append(lemma)

    fdist = FreqDist(keywords)
    result = [(x[0], (docID, (1 + log10(x[1])) / sqrt(len(keywords))))
              for x in fdist.items()]

    return result
def pos(text):
    sentence_re = r'''(?x)
	      (?:[A-Z])(?:\.[A-Z])+\.?
	    | \w+(?:-\w+)*
	    | \$?\d+(?:\.\d+)?%?
	    | \.\.\.
	    | [][.,;"'?():-_`]
	'''

    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    grammar = r"""
	    NBAR:
	        {<NN.*|JJ>*<NN.*>}  
	        
	    NP:
	        {<NBAR>}
	        {<NBAR><IN><NBAR>} 
	"""
    chunker = nltk.RegexpParser(grammar)

    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)

    #print postoks
    count = 0
    for word, tag in postoks:
        #print word,tag
        if (tag == "NN" or tag == "JJ"):
            count += 1
    if (count >= 2):
        print "Inside"
        return true
    #tree = chunker.parse(postoks)
    #terms=get_terms(tree)
    return False
def compute_imprs_word_counts(file_names):
    texts = extract_texts(file_names)
    dicts = [extract_dict(t) for t in texts]
    imprs = [d['IMPRESSION'] for d in dicts]
    # for split
    #adapted from https://stackoverflow.com/a/22178786/1469195
    # (removed capturing groups)
    pattern = r'''(?x)               # set flag to allow verbose regexps
                  (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
                  | \$?\d+(?:\.\d+)?%? # numbers, incl. currency and percentages
                  | \w+(?:[-']\w+)*    # words w/ optional internal hyphens/apostrophe
                  | [+/\-@&*]        # special characters with meanings
                '''
    words = regexp_tokenize("\n".join(imprs), pattern)
    words = clean_words(words)
    counter = Counter(words)
    result = namedtuple('WordResult', ['counter',
                                       'imprs',
                                       'words'], verbose=False)(
        counter=counter,
        imprs=imprs,
        words=words,
    )
    return result
Example #51
    def __init__(self, dbfile, colText, colCnt, min_support=.01):
        timer = Timer()

        self.min_support = min_support

        dbSize = 0
        vocab = {}
        itemset = []
        texts = []
        ## load data, tokenize the text, hash vocabulary
        f = open(dbfile, 'rU')
        rdr = csv.reader(f, delimiter='\t')
        fdist = nltk.probability.FreqDist()
        for r in rdr:
            text = unicode(r[colText], 'utf-8')
            tokens = nltk.regexp_tokenize(text, tokenPattern)
            if colCnt < 0:
                num = 1
            else:
                num = int(r[colCnt])
            text = []
            for t in tokens:
                if not t in stopwords:
                    if not t in vocab:
                        vocab[t] = len(itemset)
                        itemset.append(t)
                    text.append(vocab[t])
            if len(text) > 0:
                texts.append((text, num))
            dbSize += num
        self.dbSize = dbSize
        self.vocab = vocab
        self.itemset = itemset
        self.texts = texts
        f.close()
        timer.printElapsed()
def limpiar_texto(texto):
  '''In this function the text is tokenized using a regular expression.'''
  spanishstemmer=SnowballStemmer('spanish')

  pattern = r'''(?x)                 #set flag to allow verbose regexps
              (?:[A-Z]\.)+          #abbreviations, e.g. U.S.A.
              | \w+(?:-\w+)*        #words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%?  #currency and percentages, e.g. $12.40, 82%

  '''
  #Define stop words
  stop_words = set(stopwords.words('spanish'))
  #convert to lowercase
  texto = texto.lower()

  #tokenize using the regular expression
  texto_tokenizado = nltk.regexp_tokenize(texto,pattern)

  #remove stopwords (closed-class words)
  words = [w for w in texto_tokenizado if not w in stop_words]

  #reduce each word to its stem
  stems = [spanishstemmer.stem(token) for token in words]
  return stems
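An illustrative call, assuming the Spanish stopwords corpus is installed; stopwords are removed and the surviving tokens are stemmed:

print(limpiar_texto("Los niños corren rápidamente por el parque"))
# the Snowball stems of 'niños', 'corren', 'rápidamente' and 'parque' (stopwords removed)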
Example #53
    def pos_tag_sent(self, sent, boi_form=True):
        """
        Tag sentence
        convert sentence from BOI-form to String-form and tag it using the provided tagger
        :param sent: boi sentence
        :param pos_tagger: tagger to tag the sentence
        :return:
        """
        if boi_form:
            untagged_sent = self.convert_from_boi_to_sent(sent)
        else:
            untagged_sent = sent

        tokens = nltk.regexp_tokenize(untagged_sent, pattern=" ", gaps=True)
        pos_tagged_sent = self.tagger.tag(tokens)

        result = []
        for i, (_, word) in enumerate(pos_tagged_sent):
            r = word.split('/')
            if len(r) == 3:
                r = ['/', r[-1]]
            result.append((tuple(r), sent[i][1]))

        return result
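The call above uses gaps=True, so the pattern describes the separators rather than the tokens themselves; a minimal illustration:

import nltk

print(nltk.regexp_tokenize("one two  three", pattern=r"\s+", gaps=True))
# ['one', 'two', 'three']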
Example #54
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

#Taken from Su Nam Kim Paper...
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)

print(postoks)

tree = chunker.parse(postoks)

from nltk.corpus import stopwords
stopwords = stopwords.words('english')


def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()
Example #55
File: ls1.py Project: ntabgoba/nltk1
tokens = nltk.word_tokenize(raw)
[porter.stem(t) for t in tokens]

#Lemmatization -WordNet lemmatizer 
wnl = nltk.WordNetLemmatizer()  # if you want to compile the vocabulary of some texts and want a list of valid lemmas
[wnl.lemmatize(t) for t in tokens]

#3.7 Regular Expresssions for Tokenizing Text
# simple approach - split on whitespace
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
            well without--Maybe it's always pepper that makes people hot-tempered,'..."""
re.split(r' ',raw)
re.split(r'[ \t\n]+', raw)  #matches one or more whitespace characters (spaces, tabs, newlines)
re.split(r'\W+', raw)  #\W splits the input on anything other than a word character, where \w = [a-zA-Z0-9_]

# nltk.regexp_tokenize() is more efficient for this task
text = 'That U.S.A. poster-print costs $12.40..'

pattern = r'''(?x)
    (?:[A-Z]\.)+
  | \w+(?:-\w+)*
  | \$?\d+(?:\.\d+)?%?
  | \.\.\.
  | [][.,;"'?():-_`]
'''
nltk.regexp_tokenize(text, pattern)  # works once the groups are non-capturing

#Segmentation -Sentence segemantation -word segmentation
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle') 
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)
Example #56
def split_str(line):
    words = nltk.regexp_tokenize(line, tokens_pattern)
    # print(words)
    return words
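split_str relies on a module-level tokens_pattern that is not shown here; a hypothetical way to define it and exercise the function:

tokens_pattern = r"[A-Za-z]+(?:'[a-z]+)?|\d+"  # hypothetical pattern: words with optional apostrophes, plus numbers
print(split_str("It's 42 degrees"))
# ["It's", '42', 'degrees']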
Example #57
import re

str = "aoweijriow aofejofenr wajioejfo er (!#^ &% etc@ 3 $ $%#) wearjwoaieraw awoerj oawier"

p = re.compile('\(.*\)')
print(p.findall(str))

print("22th".isalnum())

from nltk import regexp_tokenize
txt = "Today it's 07.May 2011. Or 2.999."
print(regexp_tokenize(txt, pattern=r'\w+(?:[.,]\w+)*|\S+'))
# e.g. ['Today', 'it', "'s", '07.May', '2011', '.', 'Or', '2.999', '.']
Example #58
        trainable = True

if trainable:
    words = []  # this will contain the root words
    labels = []  # this will contain the tags
    docs_patterns = []  # this will contain each pattern list
    docs_labels = [
    ]  # this will contain each tag, but many times to get the amount of the tags

    for intent in data['intents']:
        for pattern in intent['patterns']:
            # Equals the list pattern to words_pattern without punctuation marks
            # ['Hi']
            # ['How', 'are', 'you']
            # ...
            words_pattern = nltk.regexp_tokenize(pattern, "(\d+|\w+)")

            # Extend the list words with the list 'words_pattern'
            # words = ['Hi', 'How', 'are', 'you', 'Is', 'anyone', 'there', ...]
            words.extend(words_pattern)

            # Append the list 'words_pattern' to the list doc_x (not extending)
            # doc_patterns = [['Hi'], ['How', 'are', 'you'], ['Is', 'anyone', 'there']], ... ]
            docs_patterns.append(words_pattern)

            # Adding the tags to the list doc_y
            # doc_labels = ['greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'goodbye', 'goodbye', ...]
            docs_labels.append(intent["tag"])

        # Append each label one time in list labels
        # labels = ['greeting', 'goodbye', 'thanks', ...]
Example #59
 def tokenize(self, formula):
     clean_formula = re.sub(self._REGEX_CLEAN, '', formula)
     clean_formula = re.sub(self._REGEX_LETTERS, '', clean_formula)
     tokens = regexp_tokenize(clean_formula, self._REGEX_TOKEN)
     return tokens
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer  #stemming and lemmatizing
from nltk import regexp_tokenize  #tokenize
import pandas as pd
import csv

wr = open("stemmed_words.txt", "wr")  #write to stemmed word text file
with open('words_alpha.csv', 'r') as csvfile:  #read from csv file
    data = csv.reader(csvfile, delimiter=' ')
    print data
    stemmer1 = PorterStemmer()  #porterStemmer
    lemma = WordNetLemmatizer()  #lemmatizer
    for row1 in data:
        #print (row1)
        row1 = row1[0].replace('\n',
                               '')  #replace occurence of new line with nochar
        x = stemmer1.stem(row1)
        y = lemma.lemmatize(row1)
        row1 = regexp_tokenize(row1, "[\w']+")  #tokenizing
        z = nltk.pos_tag(row1)  #pos_tagging
        #print row1
        print z[0][0], z[0][1], x, y  #,z'''
        wr.write(z[0][0] + " " + z[0][1] + " " + x + " " + y + '\n')  #writing
csvfile.close()  #closing files
wr.close()