Example #1
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern=r"\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
                    
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = list(fdict.keys())[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern=r"\w+"))
            indice = 0
            
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
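The Alphabet codebooks above are project-specific, so for reference here is a self-contained sketch of the same idea (regexp tokenization, stopword filtering, binary indicator features); binary_bow and feature_index are hypothetical names and the NLTK stopwords corpus is assumed to be installed:

import nltk
import numpy as np
from nltk.corpus import stopwords

def binary_bow(documents, feature_index):
    # feature_index: hypothetical dict mapping feature words to vector positions
    stopset = set(stopwords.words('english'))
    vectors = []
    for doc in documents:
        vector = np.zeros(len(feature_index))
        for word in set(nltk.regexp_tokenize(doc, pattern=r"\w+")):
            if word not in stopset and word in feature_index:
                vector[feature_index[word]] = 1.0
        vectors.append(vector)
    return vectors

print(binary_bow(['the cat chased a dog', 'a fish'], {'cat': 0, 'dog': 1, 'fish': 2}))
# roughly: [array([1., 1., 0.]), array([0., 0., 1.])]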
Example #2
def get_freqs(text):

    stop_words = nltk.corpus.stopwords.words('english')
    frequencies = defaultdict(int)

    pattern = r'''(?x)    # set flag to allow verbose regexps
                    (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
                    | \w+(?:-\w+)*      # words with optional internal hyphens
                    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
                    | \.\.\.            # ellipsis
                    | [][.,;"'?():-_`]  # these are separate tokens
                     '''

    if type(text) == list:
        print 'number of links: '+ str(len(text))
        for t in text:
            content = t['content']
            tokens = nltk.regexp_tokenize(content, pattern)
            for word in tokens:
                if len(word) > 2 and word.lower() not in stop_words:
                    cap = word[0].upper() + word[1:]
                    frequencies[cap] += 1
    else:
        tokens = nltk.regexp_tokenize(text, pattern)
        for word in tokens:
            if len(word) > 2 and word not in stop_words:
                frequencies[word] += 1
    print "frequency size: "+str(len(frequencies))
    return frequencies
Example #3
def load(f=str):
    files = open(f)
    raw = files.read()
    pattern = r"""(?x)  \$?\d+(?:\.\d+)?%?    # currency
                      | \d+/\d+/\d+           # dates"""
    return nltk.regexp_tokenize(raw, pattern)
Example #4
def nltkTest():
    s = "russia licenza 8.1.5 U.S."
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)

    s = "Saldo vs. Fattura n. 2015/004"
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
Example #5
File: chapter3.py Project: hbdhj/python
def regularExpressionTokenizer():
    text = 'That U.S.A. poster-print costs $12.40...'
    pattern = r'''(?x)          # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | [][.,;"'?():-_`]    # these are separate tokens
    '''
    print(nltk.regexp_tokenize(text, pattern))
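Note that recent NLTK versions implement regexp_tokenize with re.findall, so capturing groups change what is returned; the non-capturing form above is the safe one. A minimal illustration of the difference:

import nltk

text = 'poster-print costs twelve dollars'
print(nltk.regexp_tokenize(text, r'\w+(-\w+)*'))    # ['-print', '', '', '']  -- the group, not the match
print(nltk.regexp_tokenize(text, r'\w+(?:-\w+)*'))  # ['poster-print', 'costs', 'twelve', 'dollars']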
def get_links(text):
    # checks only for  'http://...' and 'www...'
    text = text + " "
    pat = "http://.*?\s"
    links = nltk.regexp_tokenize(text, pat)
    text = " " + text + " "
    pat = "\swww\..*?\..*?\s"
    links.extend(nltk.regexp_tokenize(text, pat))
    links = map(lambda x: x[:-1], links)
    return links
Example #7
File: stem.py Project: rve/keyword
def poss_test(test_file,test_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)

    t = open(test_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    stopwords = sw
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a == 0:
            a += 1
            continue
        if a%1000 == 0:
            print a    
        a += 1
        #if a == 8:
        #    sys.exit(1)

        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"[a-z]\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)

        #light stem
        #title = set([stem(word) for word in title])
        #body = set(body)
        #body = set([stem(word) for word in body])

        #remove stopwords
        #body = filter(g,body)
        #title = filter(g,title)

        body = ' '.join(body)
        title = ' '.join(title)
        t.write('%s , %s \n'%(title,body))
Example #8
def poss_test(test_file,test_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)

    t = open(test_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw 
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:

        if a%10000 == 0:
            print(a)
        a += 1
        #if a == 8:
        #    sys.exit(1)

        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"(\.?[a-z][a-z0-9\+\.\#\-]+[a-z0-9\+\#])"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])


        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s"\n'%(row[0],title,body))
Example #9
    def query_episode(self, show_title, 
        ep_title, se_number, ep_number, runtime):
        """build video list prior to scoring
        """
        qres = {}

        # Query 1
        qlist = (show_title, ep_title)
        # Search YouTube
        tmp = self.search('%s %s' % qlist)
        for k, v in tmp.items():
            qres[k] = v
        # Query 2
        qlist = (show_title, ep_title, 
            se_number, ep_number)
        # Search YouTube
        tmp = self.search('%s %s  %s  %s' % qlist)
        for k, v in tmp.items():
            qres[k] = v
        # Query 3
        qlist = (show_title, 
            se_number, ep_number)
        # Search YouTube
        tmp = self.search('%s s%02de%02d' % qlist)
        for k, v in tmp.items():
            qres[k] = v

        # Show tokens
        sh_stem = [self._lancaster.stem(t) \
            for t in nltk.regexp_tokenize(
                show_title.encode('utf8'), r"\w+")]

        # Episode stem tokens if exist
        if ep_title:
            ep_stem = [self._lancaster.stem(t) \
                for t in nltk.regexp_tokenize(
                    ep_title.encode('utf8'), r"\w+")]
        else:
            ep_stem = None

        res = {'Output': qres, 
               'Input': {},}
        res['Input']['show_title'] = show_title
        res['Input']['ep_title'] = ep_title
        res['Input']['sh_stem'] = sh_stem
        res['Input']['ep_stem'] = ep_stem
        res['Input']['se_number'] = se_number
        res['Input']['ep_number'] = ep_number
        res['Input']['runtime'] = runtime

        return res
Example #10
File: pre_nltk.py Project: rve/keyword
def poss_train(train_file,train_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)

    t = open(train_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw  # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a%100000 == 0:
            print a    
        a += 1
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        #st = LancasterStemmer()
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])

        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n'%(row[0], title,body,row[3]))
Example #11
def normalized(text, lowercase=True, fix=True, tuples=False):
    """Tokenize, remove capitalization and exclude punctuation
    """
    if fix:
        text = fix_text(unicode(text))
    pattern = r"""(?x)    # verbose regexps
        \w+(-\w+)*        # words with optional internal hyphens
    """
    result = [w for w in nltk.regexp_tokenize(text, pattern)]
    if lowercase:
        result = [w.lower() for w in nltk.regexp_tokenize(text, pattern)]
    if tuples:
        result = tuple(result)
    return result
Example #12
def handleSubject1(outputFile):
    """
    :return: dict
    """
    index = 0
    termdict = dict()
    subjectList = list()

    f = open("data/topic/subject1_w_date.txt")
    for item in f:
        array = item.strip().split("DELIMER")
        count = array[0]
        subject = array[3]

        for (regex, repl) in helper.regexList.items():
            subject = regex.sub(repl, subject)
        for s in helper.specialSet:
            subject = subject.replace(s, "")

        termList = nltk.regexp_tokenize(subject, helper.nltkPattern)  # use the nltk package to tokenize the subject
        s = ""
        for term in termList:
            if term.lower() not in helper.excludeSet:
                s += term + " "  # reconstruct the subject
                if term not in termdict:
                    termdict[term.strip()] = index
                    index += 1

        if s != "":
            regex = re.compile("\s+")
            s = regex.sub(" ", s)
            subjectList.append("{}DELIMER{}DELIMER{}DELIMER{}".format(count, array[1], array[2], s.strip()))

    fileHelper.writeIterableToFile(outputFile, subjectList)
    return termdict
def tokenprocess(Strtext): 
   
    f = open(Strtext)
    raw = f.read().strip()
    stop_words = stopwords.words('english')
    
    pattern = r'''(?x)(?:[A-Z]\.)+|\w+(?:-\w+)*|\$?\d+(?:\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''
    text1 = map(lambda word:word.lower(),nltk.regexp_tokenize(raw,pattern))
    text1_filter = [word for word in text1 if len(word) > 1 and word.find("'") == -1 and word not in stop_words]
    
  
    return text1_filter




#eassylist = corpus_data()
#print eassylist[-1]

#stop_words = stopwords.words('english')
#print stop_words


    
    
    
    
Example #14
 def tag(text):
     tokens = nltk.regexp_tokenize(text, SENTENCE_REGEX)
     pos_tokens = nltk.tag.pos_tag(tokens)
     chunker = nltk.RegexpParser(GRAMMAR)
     tree = chunker.parse(pos_tokens)
     terms = Tagger.get_terms(tree)
     return Tagger.word_list(terms)
def russian_get_text(inp, output):
    # parse tweets from .csv file by Julia Rubtsova
    # from 'Метод построения и анализа корпуса коротких текстов для задачи классификации отзывов'
    data = read_data(inp)

    res = []
    pattern = '''"(.*?)";'''
    for line in data:
        tokens = nltk.regexp_tokenize(line, pattern)
        if len(tokens) < 4:
            continue

        text = tokens[3][1:-2]
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        sname = tokens[2][1:-2]
        if text == '':
            continue

        row = [text, 'not-given', sname, 'not-given', ','.join(hashtags), ','.join(mentions), ','.join(links)]
        row = '\t'.join(row)

        res.append(row)

    write_data(output, res)
Example #16
def ShowCollocations():
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	pattern = r'''(?x)(?:[A-Z]\.)+|\w+(?:[-']\w+)*|\$?\d+(?:\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
Example #17
    def main(self, text):
        """Breaks a single string into a tree using the grammar and returns
        the specified words as a string."""

        if text is None:
            return None

        try:
            text = text.encode("ascii", "ignore")
        except:
            text = text.decode("utf-8", "ignore").encode("ascii", "ignore")

        chunker = nltk.RegexpParser(grammar)

        toks = nltk.regexp_tokenize(text, sentence_re)
        postoks = nltk.tag.pos_tag(toks)

        #print postoks
        tree = chunker.parse(postoks)

        terms = self.get_terms(tree)

        words = self.get_words(terms)

        return words
Example #18
def longitud_promedio_palabras_moens(lista):
    regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+"
    total_palabras_en_oraciones = 0
    num_oraciones = 0
    tokens = 0
    promedio_longitud_palabras_oraciones = []
    for oracion in lista:
        total_palabras_oracion = 0
        num_palabras_oracion = 0
        tokens = nltk.regexp_tokenize(oracion, regexp)
        total_palabras_en_oraciones += len(tokens)
        for palabra in tokens:
            total_palabras_oracion += len(palabra)
            num_palabras_oracion += 1
            #print palabra
            #print len(palabra)
        if total_palabras_oracion > 0:
            promedio_longitud_palabras_oraciones.append(total_palabras_oracion/num_palabras_oracion)
        else:
            print oracion
        #print len(tokens)
        #total += len(oracion.split())
        num_oraciones += 1
    #promedio = total_palabras_en_oraciones / num_oraciones
    #print promedio_longitud_palabras_oraciones
    suma_promedios=0
    num_promedios = 0
    for promedios in promedio_longitud_palabras_oraciones:
        suma_promedios += promedios
        num_promedios += 1
    promedio = suma_promedios/num_promedios
        
    #promedio = sum(promedio_longitud_palabras_oraciones)/float(len(promedio_longitud_palabras_oraciones))    
    return promedio
def classif(text, mass, num_all_docs, num_words_unic):
    stm = Stemmer('russian')
    text = stm.stemWords(regexp_tokenize((text.decode('UTF-8')).lower(), r"(?x) \w+ | \w+(-\w+)*"))
    num_povt_words = 0
    summa = 0
    while_iter = 0
    while while_iter < len(mass):
        summand_1 = log((mass[while_iter].num_docs + 0.0) / (num_all_docs + 0.0) + 0.0, 1.1)
        for i in text:
            for i1 in mass[while_iter].lst_allword:
                if i == i1:
                    num_povt_words = num_povt_words + 1
            summand_2 = log(((num_povt_words + 1) + 0.0) / ((num_words_unic + mass[while_iter].num_words) + 0.0), 1.1)
            num_povt_words = 0
            summa = summa + summand_2
        mass[while_iter].c = summand_1 + summa
        summa = 0
        while_iter = while_iter + 1

    max_c = -100000
    while_iter = 0
    number_max = 0

    while while_iter < len(mass):
        print mass[while_iter].c
        if mass[while_iter].c > max_c:
            max_c = mass[while_iter].c
            number_max = while_iter
        while_iter = while_iter + 1
    print mass[number_max].name_categories
Example #20
def numero_puntuacion_moens(texto):
    regexp = "[/,$?:;!()&%#=+{}*~.]+"
    tokens = nltk.regexp_tokenize(texto, regexp)
    total = len(tokens)
    print len(tokens)    
    print tokens    
    return total
Example #21
def word_couple_con_puntuacion_pares_minusculas(lista):
    word_couples = []
    
    
    regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+-*[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[.]+|[/,$?:;!()&%#=+{}*~.]+|[0-9]+"
    
    for oracion in lista:
        
        #oracion = str(oracion)
        #oracion = oracion.to_lower
        #print oracion
        
        
        tokens = nltk.regexp_tokenize(oracion.lower(), regexp)
        #print len(tokens)
        
#         tokens_lower = []
#         for i in range(len(tokens)):
#             palabra = str(tokens[i])
#             tokens_lower.append(palabra.to_lower() )          
            
        
        pairs = list(itertools.permutations(tokens, 2))
        for pair in pairs:
            word_couples.append(pair[0]+"~"+pair[1])
        
    return word_couples
Example #22
File: 9.py Project: Anastasia1302/nltk
def tokenize_punctuation(t):
	"""Tokenizes the punctuation in a text 't'."""
	pattern = r'''(?x)			# set to be verbose
	\W 						# matches each non-word character (including whitespace).
	'''
	matches = nltk.regexp_tokenize(t, pattern)
	return matches
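A quick check of what this returns; note that the space between the words is itself a \W match:

print(tokenize_punctuation("Hello, world!"))
# [',', ' ', '!']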
Example #23
 def extract(self, text):
     ''' Extract and freudify noun phrases from text, return all successfully
     freudified noun phrases. '''
     
     toks = nltk.regexp_tokenize(text, self.sentence_re)
     postoks = nltk.tag.pos_tag(toks)
     tree = self.chunker.parse(postoks)
     terms = self._get_terms(tree)
     
     phrases = set()
     
     # Loop through all the noun phrases and try to freudify them.
     for term in terms:
         if (len(term)) < 2: continue
         changed = False
         context = ""
         phrase = []
         for part in term:
             word, tag = part
             word = word.encode('ascii', 'replace')
             phrase.append(word.lower())
             rpl = self.replace_word(tag[:2], word)
             if len(rpl[2]) > 0:
                 context = rpl[2]
                 phrase[-1] = rpl[0]
                 changed = True
         if changed:
             phrase = " ".join(phrase).strip()
             phrase.encode('ascii', 'replace')
             phrase = str(phrase)
             if phrase not in self.own_phrases[context]:
                 phrases.add((str(phrase), context))    
       
     phrases = list(phrases)      
     return phrases
Example #24
def generate_vocab(papers):
    """Returns the vocabulary used in the papers given in parameters, after cleaning and stopwords removal.

    Args:
        papers (list of tuples): the raw list of papers from which generates the vocabulary (each element is a tuple of 3 strings: id, title and abstract)

    Returns:
        list of strings: the list of tokens forming the vocabulary
    """
    sc = StringCleaner()

    # Generate author's vocabulary
    corpus = " ".join(p[1] + " " + p[2] for p in papers)
    # Cleaning
    corpus = sc.clean_string(corpus)
    # Tokenization
    pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
    #         we keep tokens that are words (with optional internal hyphens), acronyms and percentages
    tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english"))
    num_re = re.compile("^\d+$")
    tokens = set([t for t in tokens if not num_re.match(t)]) # we remove only-numeric tokens
    # Stemming
    porter = nltk.stem.PorterStemmer()

    return [porter.stem(t) for t in tokens]
def AO_lTokenize(AO_sText):


    '''
        This breaks a text into individual words
        Adapted From Natural Language Processing with Python
    '''
    regex = r'''(?xi)
    (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills
  | ([A-Z]\.)+                                              # Abbreviations (U.S.A., etc.)
  | ([A-Z]+\&[A-Z]+)                                        # Internal ampersands (AT&T, etc.)
  | (Mr\.|Dr\.|Mrs\.|Ms\.)                                  # Mr., Mrs., etc.
  | \d*\.\d+                                                # Numbers with decimal points.
  | \d\d?:\d\d                                              # Times.
  | \$?[,\.0-9]+\d                                          # Numbers with thousands separators, (incl currency).
  | (((a|A)|(p|P))\.(m|M)\.)                                # a.m., p.m., A.M., P.M.
  | \w+((-|')\w+)*                                          # Words with optional internal hyphens.
  | \$?\d+(\.\d+)?%?                                        # Currency and percentages.
  | (?<=\b)\.\.\.(?=\b)                                     # Ellipses surrounded by word borders
  | [][.,;"'?():-_`]
    '''
    # Strip punctuation from this one; solr doesn't know about any of it
    tokens = regexp_tokenize(AO_sText, regex)
    # tokens = [re.sub(r'[.,?!]', '', token) for token in tokens]  # instead of this we just test word length
    return tokens
Example #26
def handle_doc(word_set,rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path+docs)
        print "start to handle the -->  "+docs
        for file_d in files:
            d_path = rs_path+docs+'/'+file_d
            #get the single file path
            with open(d_path,'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line,pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
            text_file.close()
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1,:],doc_cat[:-1]
def word_segment(data, mark_stop, english_stop):
    """
    分词并去除停用词
    :param data:
    :param stopwords_list:
    """

    """
    segment_text = nltk.word_tokenize(data.replace('.', ' '))
    segment_text = [word.lower() for word in segment_text if word.lower() not in (english_stop + mark_stop)]
    segment = nltk.pos_tag(segment_text)  #词性标注
    """

    pattern = r"""(?x)([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;'"?():-_`]"""
    segment_text = nltk.regexp_tokenize(data, pattern)
    # 可选择取词干
    # porter = nltk.PorterStemmer()
    segment_text = [t.lower() for t in segment_text if t.lower() not in (english_stop + mark_stop)]
    segment = nltk.pos_tag(segment_text)  # 词性标注

    segment_list = []
    for item in segment:
        segment_list.append(item[0] + "," + item[1])

    return segment_list
def tokenize_tag_text(description):
    """Removes some punctuation, tags each word by part-of-speech, and generates keyword and 
    keyword prhases  based on noun phrases patterns using regexp."""

    sentence_re = r'''(?x)
    ([A-Z])(\.[A-Z])+\.?  # set flag to allow verbose regexps
    | \w+(-\w+)*          # words with optional internal hyphens
    | \$?\d+(\.\d+)?%?    # currency and percentages
    | \.\.\.              # ellipsis
    | [][.,;"?():-_`]     # separate tokens
    '''

    grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}             # Nouns and Adjectives, terminated with Nouns
        {<NNP|NNPS>+<IN>?<NNP|NNPS>+}  # A sequence of proper nouns connected with zero or more prepositions
        {<DT|PP\$>?<JJ>*<NN|NNS>}      # Determiners (e.g. 'the', 'a') or possessive, followed by one or more adjective 
        {<NN>+}                        # A sequence of one or more nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  
    """

    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(description, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = chunker.parse(postoks)
    return tree 
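Illustrative usage, assuming NLTK and its POS-tagger model are available; the returned nltk.Tree contains NP subtrees that hold the candidate keyword phrases:

tree = tokenize_tag_text("The quick brown fox jumps over the lazy dog")
print(tree)  # an nltk.Tree with NP chunks such as the phrases around 'fox' and 'dog'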
Example #29
 def compute_df(self, document_list):
     '''Compute document frequency based on input document list'''  
     df_cache = dict()
     df_output = dict()
     
     d_index = 0
     for document in document_list:
         d_index += 1
         # tokenize each document
         reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE)
         for item in reg_toks:
             # change each word to lower case and lemmatize
             item = normalise(item)
             if item not in df_cache:
                 df_cache[item] = set([d_index])
             else:
                 df_cache[item].add(d_index)
     
     for item in df_cache:
         if acceptable_word(item):
             df_output[item] = len(df_cache[item])
     
     df_output['total_document'] = len(document_list)
     
     return df_output
Example #30
    def parse(self, response):
        fd = nltk.FreqDist()
        punct = set(string.punctuation)
        self.i += 1
        titles = Selector(response=response).xpath('//title/text()').extract()
        filename = response.url.split("/")[-2]
        filedir = dirs[self.i - 1] + '/' + filename
        filedir = 'Top' + '/' + filedir.split('/')[1] + '-' + filedir.split('/')[2]
        print dirs[self.i - 1]
        print filedir
        temp = stripAllTags(response.body)
        s = MLStripper()
        s.feed(temp)
        pure_body = s.get_data()
        pure_body = pure_body.lower()
        pure_body = unicodedata.normalize('NFKD', pure_body).encode('ASCII', 'ignore')
        for word in nltk.regexp_tokenize(pure_body, pattern=r'\.|(\s+)', gaps=True):
            if word not in punct and word not in common_words:
                fd.inc(word)
        freq_tuples = fd.items()
        if not os.path.exists(filedir):
            os.makedirs(filedir)
        filedir = filedir + '/' + filename
        with open(filedir, 'wb') as f:
            f.write('@attribute ' + filedir.split('/')[1] + ' {0,1}\n\n' + '@data\n')
            f.write('\n')
            for title in titles:
                f.write(title.encode('utf-8').strip())
            f.write('\n')
            for item in freq_tuples:
                i = 0
                while i < item[1]:
                    i += 1
                    f.write(item[0] + '\n')
Example #31
def URLDECODE(XSS):

    XSS = XSS.lower()
    XSS = unquote(unquote(XSS))
    XSS, num = re.subn(r'\d+', "0", XSS)

    XSS, num = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?]+', "http://u",
                       XSS)

    r = '''
        (?x)[\w\.]+?\(
        |\)
        |"\w+?"
        |'\w+?'
        |http://\w
        |</\w+>
        |<\w+>
        |<\w+
        |\w+=
        |>
        |[\w\.]+
    '''
    return nltk.regexp_tokenize(XSS, r)
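An illustrative call on a hypothetical payload; digits collapse to "0" and the URL collapses to "http://u" before tokenization. (Note that the (?x) flag above sits after leading whitespace, which Python 3.11+ rejects; on earlier versions the call behaves as shown.)

print(URLDECODE('<IMG SRC="http://evil.example/x.js" onerror=alert(123)>'))
# roughly: ['<img', 'src=', 'http://u', 'onerror=', 'alert(', '0', ')', '>']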
Example #32
def get_docs(file_name):
	in_put = open(file_name, 'rU')
	raw = in_put.readlines()

	# keep the line numbers of papers that have an abstract
	paper_id = [raw.index(w) for w in raw if w != 'null\n']
	raw = [w.lower() for w in raw if w != 'null\n']

	docs = [nltk.regexp_tokenize(w, pattern) for w in raw]

	# keep only English words (hyphenated words get dropped) and remove stopwords
	for i in xrange(len(docs)):
		docs[i] = [w for w in docs[i] if w.isalpha() and w not in stop_word]

	wnl = nltk.WordNetLemmatizer()

	# lemmatization
	for i in xrange(len(docs)):
		docs[i] = [wnl.lemmatize(t) for t in docs[i]]

	in_put.close()

	return docs
Example #33
def computeSentiment(text):
    # Tokenize and remove stop words
    tokens = []
    for t in nltk.regexp_tokenize(text.lower(), '[a-z]+'):
        if t not in sr:
            tokens.append(t)
    tokens[:10]

    # Count the number of positive and negative words.
    pos_count = 0
    neg_count = 0
    for t in tokens:
        if t in pos_words:
            pos_count += 1
        elif t in neg_words:
            neg_count += 1

    # Compute sentiment
    if (pos_count + neg_count) > 0:
        sentiment = float(pos_count - neg_count) / float(pos_count + neg_count)
    else:
        sentiment = 0
    return sentiment
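computeSentiment reads the module-level names sr (stopwords), pos_words and neg_words; a self-contained way to wire them up for a quick test, where the two word sets are tiny hypothetical stand-ins for a real opinion lexicon:

from nltk.corpus import stopwords

sr = set(stopwords.words('english'))
pos_words = {'good', 'great', 'love'}      # hypothetical positive lexicon
neg_words = {'bad', 'terrible', 'hate'}    # hypothetical negative lexicon

print(computeSentiment("I love this phone but the battery is bad"))
# (1 - 1) / (1 + 1) = 0.0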
Example #34
def GeneSeg(payload):
    # generalize digits to "0"
    payload = payload.lower()
    #payload=unquote(unquote(payload))   # already URL-decoded upstream, so no need to decode again
    payload, num = re.subn(r'\d+', "0", payload)
    # replace URLs with "http://u"
    payload, num = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?]+',
                           "http://u", payload)
    # tokenize
    r = '''
        (?x)[\w\.]+?\(
        |\)
        |"\w+?"
        |'\w+?'   
        |http://\w
        |</\w+>
        |<\w+>
        |<\w+
        |\w+=
        |>
        |[\w\.]+
    '''
    return nltk.regexp_tokenize(payload, r)
Example #35
    def text_parse(cls, x):
        try:
            sentence = x.strip().lower()
        except:
            sentence = x

        sentence = re.sub(cls.hndl_regex, cls.hndl_repl, sentence)  # replace @-mentions
        sentence = re.sub(cls.hash_regex, cls.hash_repl, sentence)  # replace #hashtags
        sentence = re.sub(cls.url_regex, cls.url_repl, sentence)  # replace URLs
        sentence = re.sub(cls.rpt_regex, cls.rpt_repl, sentence)  # collapse repeats, e.g. yoooooooo -> yoo

        emoticons_regex = [(repl, re.compile(cls.regex_union(cls.escape_paren(regx)))) for (repl, regx) in
                           cls.emoticons]  # replace emoticons
        for (repl, regx) in emoticons_regex:
            sentence = re.sub(regx, ' ' + repl + ' ', sentence)

        pattern = r""" (?x)(?:[a-z]\.)+ 
                        | \d+(?:\.\d+)?%?\w+
                        | \w+(?:[-']\w+)*
                        | (?:[-.!?]{2,})
                        | [][.,;"'?():$-_*`]"""
        word_list = nltk.regexp_tokenize(sentence, pattern)
        return word_list
def talk_to_bot():
    vocab = chatbot.read_vocab()
    vectors = chatbot.read_vectors()

    # Problem
    # Find the length of the longest token in the question and answer data.
    dialog_questions = vectors[::2]
    dialog_answers = vectors[1::2]

    max_len_q = max([len(q) for q in dialog_questions])
    max_len_a = max([len(a) for a in dialog_answers]) + 1
    print(max_len_q, max_len_a)  # 9 10

    # ------------------------------------------------ #

    # model = tf.keras.  (left unfinished in the source snippet)
    onehot = np.eye(len(vocab), dtype=np.float32)

    while True:
        sys.stdout.write('왕자: ')
        sys.stdout.flush()

        line = sys.stdin.readline()
        line = line.strip()

        if '끝' == line:
            break

        # Problem
        # Split the input sentence into tokens.
        # tokens = line.split()         # when there may be several spaces in a row(?)
        tokens = nltk.regexp_tokenize(line, r'\w+')
        # print(tokens)                 # e.g. ['이리', '와서', '나하고', '놀자']

        # Problem
        # Convert the tokens into a question (map string tokens to their vocabulary indices).
        question = [vocab.index(t) if t in vocab else chatbot._UNK_ for t in tokens]
def analyze2(text2):
    # takes a list of comment strings and tokenizes and finds pairs of positive and negative words with specific phone features
    new=[]
    tokens=[]
    count=0
    negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
    with open("positive-words.txt",'r') as f:
        positive_words=[line.strip() for line in f]
    with open("negative-words.txt",'r') as f:
        negative_words=[line.strip() for line in f]
    positive_tokens=[]
    negative_tokens=[] #N is 1 this time, negating word right before pos or neg word
    reviewpos=[]
    reviewneg=[]
    features=["headphones", "battery", "sound", "charge", "screensize", "size", "space", "storage", "camera", "speed", "display", "sensor", "casing", "price"]
    for text in text2:
        #text=text.strip(string.punctuation)
        #text=text.strip(" ")
        #tokens=nltk.word_tokenize(text)
        #tokens = re.split(r"\W+", text)
        pattern=r'\w[\w\'-]*\w'      
        tokens=nltk.regexp_tokenize(text, pattern)
        tokens=[tokens.lower() for tokens in tokens]
        #tokens=[token.strip(string.punctuation) for token in tokens]
        #tokens=[token.strip() for token in tokens if token.strip()!='']
        new.append(tokens) #change += to not have seperated list per comment
        count+=1
    for x in new:
        for i in range(0, len(x)):
            previ=""
            if i>0:
                previ=x[i-1]
            if previ in positive_words and x[i] in features:
                    reviewpos.append((previ,x[i]))
            if previ in negative_words and x[i] in features:
                    reviewneg.append((previ,x[i]))
    return reviewpos, reviewneg
def tknse(s):
    """
    Tokenises a sentence string in a suitable to way to analyse
    both bible text and twitter data 
    (e.g. catching and filtering out mentions, URLs, ...)
    """

    import string
    from nltk import regexp_tokenize

    # define pattern for regexp
    pattern = [
        r'<[^>]+>',  # HTML tags (drop)
        r'(?:@[\w_]+)',  # @-mentions (catch and filter drop)
        #r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags (keep as words)
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs (catch and filter)
        r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers (drop)
        r'(?:[A-Z]\.)+',  # abbreviations, e.g. U.S.A.
        #r'\$?\d+(?:\.\d+)?%?', # currency and percentages, e.g. $12.40, 82% (leave as numbers)
        r'(?:[\w_]+)',  # other words
        r'(?:\S)'  # anything else
    ]
    pattern = r'(' + '|'.join(
        pattern) + ')'  # collapse into single regex string
    tok = regexp_tokenize(s, pattern)  # tokenise

    # filter out unwanted tokens
    tok = list(
        filter(
            lambda w: (w[0] not in ['@']) and  # @-mentions
            (w[0:4].lower() != 'http') and  # URLs
            (w.replace('.', '', 1).isdigit() == False) and  # numbers
            (w not in string.punctuation),
            tok))
    # to lower case
    tok = [w.lower() for w in tok]  # lower case only
    return (tok)
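Since tknse does its imports internally, a quick sanity check is easy; mentions, URLs, numbers and punctuation are filtered out and the remaining tokens are lower-cased:

print(tknse("Check http://example.com with @bob: 3 strikes... #done"))
# ['check', 'with', 'strikes', 'done']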
Example #40
def text_parse(input_text, language='en'):

    sentence = input_text.strip().lower()
    sentence = re.sub(r'@\s*[\w]+ | ?#[\w]+ | ?&[\w]+; | ?[^\x00-\xFF]+', '',
                      sentence)
    special_tag = {
        '.', ',', '#', '!', '(', ')', '*', '`', ':', '"', '‘', '’', '“', '”',
        '@', ':', '^', '/', ']', '[', ';', '=', '_'
    }
    pattern = r""" (?x)(?:[a-z]\.)+ 
                  | \d+(?:\.\d+)?%?\w+
                  | \w+(?:[-']\w+)*"""
    word_list = regexp_tokenize(sentence, pattern)

    filter_word = []
    if language == 'en':
        filter_word = [
            w for w in word_list
            if w not in stopwords.words('english') and w not in special_tag
        ]  # remove stopwords and special punctuation
    word_tag = pos_tag(
        filter_word, tagset=None,
        lang=language)  # POS tagging; returns a list of tagged tokens, e.g. [('Codeine', 'NNP'), ('15mg', 'CD')]

    res_word_list = []
    lemmatizer = WordNetLemmatizer()  # lemmatization
    tag_list = {
        'TO', 'RB', 'RBR', 'RBRS', 'UH', 'WDT', 'WP', 'WP$', 'WRB', 'SYM',
        'RP', 'PRP', 'PRP$', 'CD', 'POS', ':'
    }
    for i in range(0, len(word_tag)):  # drop adverbs, prepositions, particles, wh-words, pronouns, personal and possessive pronouns, etc.
        if word_tag[i][1] in tag_list:
            continue
        else:
            word = lemmatizer.lemmatize(word_tag[i][0])
            res_word_list.append(word)
    return res_word_list
Example #41
def tokenExtractor(file):
    doc = xml.dom.minidom.parse(file)

    movieText = ""

    for item in doc.getElementsByTagName("s"):
        for child in item.childNodes:
            if child.nodeName == "#text" and len(re.findall("\w", child.nodeValue)) > 1:
                movieText += child.nodeValue

    movieText = re.sub("\n\s+", " ", movieText)
    movieText = re.sub("\n", "", movieText)

    pattern = r'''(?x)(?:[A-Z]\.)+ | \w+(?:-\w+)*'''
    tokens = nltk.regexp_tokenize(movieText, pattern)

    # Keep only tokens that contain at least one word character
    tokens = [w for w in tokens if re.search(r'\w', w)]

    # Remove stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    tokens = [w for w in tokens if w.lower() not in stopwords]
    stopwords = nltk.corpus.stopwords.words('spanish')
    tokens = [w for w in tokens if w.lower() not in stopwords]
    stopwords = nltk.corpus.stopwords.words('french')
    tokens = [w for w in tokens if w.lower() not in stopwords]
    stopwords = nltk.corpus.stopwords.words('italian')
    tokens = [w for w in tokens if w.lower() not in stopwords]

    # Remove numbers
    tokens = [w for w in tokens if not re.search(r'\d', w)]

    # Lower case
    tokens = [t.lower() for t in tokens]
    tokens = [item for item in tokens if item.isalpha()]

    return tokens
Example #42
def get_pos_tags(text):
    """Used when tokenizing words"""
    text = tostring(text)
    regex_patterns = r"""(?x)      # set flag to allow verbose regexps
          (?:[A-Z]\.)+  # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*            # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]      # these are separate tokens
    """
    # POS tagging
    # postoks = nltk.pos_tag(text.split())
    toks = nltk.regexp_tokenize(text, regex_patterns)
    assert isinstance(toks, list), "toks is not a list of str, cannot tokenize."
    postoks = nltk.tag.pos_tag(toks)
    # fix a weird pos-tagging error in NLTK
    prior_pos = ''
    for i in range(0, len(postoks)):
        if prior_pos == 'TO' and 'VB' not in postoks[i][1]:
            old = postoks.pop(i)
            postoks.insert(i, (old[0], 'VB'))
        prior_pos = postoks[i][1]
    # print('getPOStags_returns:', postoks)
    return postoks
Example #43
def keyphrase_sentence(sentence):
    sentence_re = r'''(?x)          
      (?:[A-Z]\.)+             
    | \w+(?:-\w+)*        
    | \$?\d+(?:\.\d+)?%?
    | \.\.\.              
    | [][.,;"'?():_`-]    
    '''

    toks = nltk.regexp_tokenize(sentence.lower(),
                                sentence_re)  # sentence tokenisation

    postoks = nltk.tag.pos_tag(toks)

    for i in range(len(postoks)):
        if postoks[i][1][0] == 'N' or postoks[i][1] == 'RB' or postoks[i][
                1] == 'DT':  # check if 'N'/'RB'/'DT' is appearing in pos list.
            token_ls = toks[i:len(
                toks)]  ## span of keyphrase ( starting point - 'N'/'RB'/'DT',
            ## ending point - ending of that sentence )
            token_ls = [i for i in token_ls if i not in stop_word_ls
                        ]  # remove stopwords from phrases.
            if len(token_ls) >= 3:
                return " ".join(token_ls)
Example #44
def test_handle(word_list,tr_path):
    docdir_list = os.listdir(tr_path)
    test_m = []
    test_cat = []
    for dd in docdir_list:
        file_list = os.listdir(tr_path+dd)
        print "handling the---> "+dd+" <---directory.."
        for fpath in file_list:
            d_path = tr_path + dd + '/' + fpath
            with open(d_path,"rb") as text_file:
                str_tmp = ''
                test_cat.append(cat_dic[dd])
                fl = text_file.readlines()
                test_por = nltk.PorterStemmer()
                for doc_line in fl:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(doc_line,pattern)
                    for t in tokens:
                        if t.lower() in word_list:
                            str_tmp += t.lower()
                            str_tmp += ' '
                test_m.append(str_tmp)
            text_file.close()
    # finally append the whole word set as one extra document?! try once!
    str_tmp = ''
    for sw in word_list:
        str_tmp += sw
        str_tmp += ' '
    test_m.append(str_tmp)
    test_cat.append(10)
    vectorizer = CountVectorizer()
    doc_m = vectorizer.fit_transform(test_m)
    tfidf = TfidfTransformer()
    test_matrix = tfidf.fit_transform(doc_m)
    #test_matrix = log_sparsematrix(test_matrix)
    return test_matrix,test_cat
def get_clean_text_pattern(recomposed_note):
    """Function that filters through the notes, retrieves those that match
     the specified pattern and removes stopwords."""
    pattern = "([a-zA-Z0-9\\\]+(?:'[a-z]+)?)"
    recomposed_note_raw = nltk.regexp_tokenize(recomposed_note, pattern)
    # Create a list of stopwords and remove them from our corpus
    stopwords_list = stopwords.words('english')
    stopwords_list += list(string.punctuation)
    # additional slang and informal versions of the original words had to be added to the corpus.
    stopwords_list += ([
        "im", "ur", "u", "'s", "n", "z", "n't", "brewskies", "mcd’s", "Ty$",
        "Diploooooo", "thx", "Clothessss", "K2", "B", "Comida", "yo", "jobby",
        "F", "jus", "bc", "queso", "fil", "Lol", "EZ", "RF", "기프트카드", "감사합니다",
        "Bts", "youuuu", "X’s", "bday", "WF", "Fooooood", "Yeeeeehaw", "temp",
        "af", "Chipoodle", "Hhuhhyhy", "Yummmmers", "MGE", "O", "Coook",
        "wahoooo", "Cuz", "y", "Cutz", "Lax", "LisBnB", "vamanos", "vroom",
        "Para", "el", "8==", "bitchhh", "¯\\_(ツ)_/¯", "Ily", "CURRYYYYYYY",
        "Depósito", "Yup", "Shhhhh"
    ])

    recomposed_note_stopped = ([
        w.lower() for w in recomposed_note_raw if w not in stopwords_list
    ])
    return recomposed_note_stopped
Example #46
def processPreDiffCode(code):
    code = re.sub(r'(\"[\s\S]*?\")', '', code, 0, re.I)
    code = re.sub(r'(@@[\s\S]*?\n)', '', code, 0, re.I)
    code = re.sub(r'(\+[\s\S]*?\n)', '', code, 0, re.I)
    result = []
    mis = methodInvocationCase.findall(code)
    for mi in mis:
        miWords = mi.split('.')  # split the dotted invocation chain on '.'
        for miWord in miWords:
            toDeal = []
            if camelCase1.match(miWord) or camelCase2.match(miWord):
                toDeal = splitCode(miWord)
            elif upperExtCase.match(miWord):
                toDeal = splitFinalExt(miWord)
            elif upperCase.match(miWord):
                toDeal.append(miWord)
            for deal in toDeal:
                if not isDelete(deal.lower()):
                    result.append(stemmer.stem(deal))

    code = re.sub(r'([A-Za-z0-9_]+\.[A-Za-z0-9_]+)', '', code, 0, re.I)
    sentences = tokenizer.tokenize(code)
    for sentence in sentences:
        words = nltk.regexp_tokenize(sentence, pattern)
        for word in words:
            toDeal = []
            if camelCase1.match(word) or camelCase2.match(word):
                toDeal = splitCode(word)
            elif upperExtCase.match(word):
                toDeal = splitFinalExt(word)
            elif upperCase.match(word):
                toDeal.append(word)
            for deal in toDeal:
                if not isDelete(deal.lower()):
                    result.append(stemmer.stem(deal))
    return result
Example #47
def extract_clean_text(json_file):
    wv = []
    cnt = 0
    stoplist = load_stoplist()
    wordnet_lemmatizer = WordNetLemmatizer()
    with open(json_file, 'r') as json_file:
        user_tweets = json.load(json_file)
        for user in user_tweets:
            text = ''
            for tweet in user_tweets[user]:
                text += common.cleanhtml(
                    common.remove_hashtag_sign(
                        common.remove_username(
                            common.remove_url(ftfy.fix_text(tweet))))) + ' '
            # clean_texts = [wordnet_lemmatizer.lemmatize(word.lower()) for word in nltk.regexp_tokenize(text, pattern)]
            clean_texts = [
                wordnet_lemmatizer.lemmatize(word.lower())
                for word in nltk.regexp_tokenize(text, pattern)
                if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist
            ]
            wv.append(clean_texts)
            cnt += 1
    logger.info('total tweets: %d;' % cnt)
    return wv
Example #48
def tf_text(text_title_summary_reviews, docID):
    """ Returns a list of filtered terms: (term, (docID, tf/sqrt(len(keywords)))) """

    pattern = r'''(?x)              # set flag to allow verbose regexps
            aujourd'hui             # exception 1
            | prud'hom\w+           # exception 2
            | \w'                   # contractions d', l', j', t', s'
            | \d+(?:,\d+)?%?€?      # currency and percentages, e.g. 12,40€, 82%        
            | (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
            | \w+(?:-\w+)*          # words with optional internal hyphens
            #| [][.,;"'?():_`-]     # these are separate tokens; includes ], [
        '''

    words = nltk.regexp_tokenize(text_title_summary_reviews.lower(), pattern)

    keywords = []
    fdist = FreqDist()

    for elt in words:
        if elt[0] in LOADED_LEMMA:
            try:  # take the first possible lemma even though it may be wrong (e.g. abstrait -> abstraire (verb))
                lemma = [x[0] for x in LOADED_LEMMA[elt[0]]
                         if x[0][0] == elt][0][1]
            except:
                with open("backend/language/lemma/missing.txt", "a") as f:
                    f.write(unidecode.unidecode(elt) + "\n")
                lemma = elt

            if not lemma in stopwords:
                keywords.append(lemma)

    fdist = FreqDist(keywords)
    result = [(x[0], (docID, (1 + log10(x[1])) / sqrt(len(keywords))))
              for x in fdist.items()]

    return result
def pos(text):
    sentence_re = r'''(?x)
	      (?:[A-Z])(?:\.[A-Z])+\.?
	    | \w+(?:-\w+)*
	    | \$?\d+(?:\.\d+)?%?
	    | \.\.\.
	    | [][.,;"'?():-_`]
	'''

    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    grammar = r"""
	    NBAR:
	        {<NN.*|JJ>*<NN.*>}  
	        
	    NP:
	        {<NBAR>}
	        {<NBAR><IN><NBAR>} 
	"""
    chunker = nltk.RegexpParser(grammar)

    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)

    #print postoks
    count = 0
    for word, tag in postoks:
        #print word,tag
        if (tag == "NN" or tag == "JJ"):
            count += 1
    if (count >= 2):
        print "Inside"
        return true
    #tree = chunker.parse(postoks)
    #terms=get_terms(tree)
    return False
def compute_imprs_word_counts(file_names):
    texts = extract_texts(file_names)
    dicts = [extract_dict(t) for t in texts]
    imprs = [d['IMPRESSION'] for d in dicts]
    # for split
    #adapted from https://stackoverflow.com/a/22178786/1469195
    # (removed capturing groups)
    pattern = r'''(?x)               # set flag to allow verbose regexps
                  (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
                  | \$?\d+(?:\.\d+)?%? # numbers, incl. currency and percentages
                  | \w+(?:[-']\w+)*    # words w/ optional internal hyphens/apostrophe
                  | [+/\-@&*]        # special characters with meanings
                '''
    words = regexp_tokenize("\n".join(imprs), pattern)
    words = clean_words(words)
    counter = Counter(words)
    result = namedtuple('WordResult', ['counter',
                                       'imprs',
                                       'words'], verbose=False)(
        counter=counter,
        imprs=imprs,
        words=words,
    )
    return result
Example #51
    def __init__(self, dbfile, colText, colCnt, min_support=.01):
        timer = Timer()

        self.min_support = min_support

        dbSize = 0
        vocab = {}
        itemset = []
        texts = []
        ## load data, tokenize the text, hash vocabulary
        f = open(dbfile, 'rU')
        rdr = csv.reader(f, delimiter='\t')
        fdist = nltk.probability.FreqDist()
        for r in rdr:
            text = unicode(r[colText], 'utf-8')
            tokens = nltk.regexp_tokenize(text, tokenPattern)
            if colCnt < 0:
                num = 1
            else:
                num = int(r[colCnt])
            text = []
            for t in tokens:
                if not t in stopwords:
                    if not t in vocab:
                        vocab[t] = len(itemset)
                        itemset.append(t)
                    text.append(vocab[t])
            if len(text) > 0:
                texts.append((text, num))
            dbSize += num
        self.dbSize = dbSize
        self.vocab = vocab
        self.itemset = itemset
        self.texts = texts
        f.close()
        timer.printElapsed()
def limpiar_texto(texto):
  '''In this function the text is tokenized using a regular expression.'''
  spanishstemmer=SnowballStemmer('spanish')

  pattern = r'''(?x)                 #set flag to allow verbose regexps
              (?:[A-Z]\.)+          #abbreviations, e.g. U.S.A.
              | \w+(?:-\w+)*        #words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%?  #currency and percentages, e.g. $12.40, 82%

  '''
  #Define stop words
  stop_words = set(stopwords.words('spanish'))
  #convert to lowercase
  texto = texto.lower()

  #tokenize using the regular expression
  texto_tokenizado = nltk.regexp_tokenize(texto,pattern)

  #remove stopwords (closed-class words)
  words = [w for w in texto_tokenizado if not w in stop_words]

  #reduce each word to its stem
  stems = [spanishstemmer.stem(token) for token in words]
  return stems
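An illustrative call, assuming the Spanish stopwords corpus is installed; stopwords are removed and the surviving tokens are stemmed:

print(limpiar_texto("Los niños corren rápidamente por el parque"))
# the Snowball stems of 'niños', 'corren', 'rápidamente' and 'parque' (stopwords removed)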
Example #53
    def pos_tag_sent(self, sent, boi_form=True):
        """
        Tag sentence
        convert sentence from BOI-form to String-form and tag it using the provided tagger
        :param sent: boi sentence
        :param pos_tagger: tagger to tag the sentence
        :return:
        """
        if boi_form:
            untagged_sent = self.convert_from_boi_to_sent(sent)
        else:
            untagged_sent = sent

        tokens = nltk.regexp_tokenize(untagged_sent, pattern=" ", gaps=True)
        pos_tagged_sent = self.tagger.tag(tokens)

        result = []
        for i, (_, word) in enumerate(pos_tagged_sent):
            r = word.split('/')
            if len(r) == 3:
                r = ['/', r[-1]]
            result.append((tuple(r), sent[i][1]))

        return result
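The call above uses gaps=True, so the pattern describes the separators rather than the tokens themselves; a minimal illustration:

import nltk

print(nltk.regexp_tokenize("one two  three", pattern=r"\s+", gaps=True))
# ['one', 'two', 'three']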
Example #54
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

#Taken from Su Nam Kim Paper...
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)

print(postoks)

tree = chunker.parse(postoks)

from nltk.corpus import stopwords
stopwords = stopwords.words('english')


def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()
Example #55
File: ls1.py Project: ntabgoba/nltk1
tokens = nltk.word_tokenize(raw)
[porter.stem(t) for t in tokens]

#Lemmatization -WordNet lemmatizer 
wnl = nltk.WordNetLemmatizer()  # if you want to compile the vocabulary of some texts and want a list of valid lemmas
[wnl.lemmatize(t) for t in tokens]

#3.7 Regular Expresssions for Tokenizing Text
# simple approach - split on whitespace
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
            well without--Maybe it's always pepper that makes people hot-tempered,'..."""
re.split(r' ',raw)
re.split(r'[ \t\n]+', raw)  #matches one or more whitespace characters (spaces, tabs, newlines)
re.split(r'\W+', raw)  #\W splits the input on anything other than a word character, where \w = [a-zA-Z0-9_]

# nltk.regexp_tokenize() is more efficient for this task
text = 'That U.S.A. poster-print costs $12.40..'

pattern = r'''(?x)
    (?:[A-Z]\.)+
  | \w+(?:-\w+)*
  | \$?\d+(?:\.\d+)?%?
  | \.\.\.
  | [][.,;"'?():-_`]
'''
nltk.regexp_tokenize(text, pattern)  # works once the groups are non-capturing

#Segmentation -Sentence segemantation -word segmentation
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle') 
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)
Example #56
def split_str(line):
    words = nltk.regexp_tokenize(line, tokens_pattern)
    # print(words)
    return words
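split_str relies on a module-level tokens_pattern that is not shown here; a hypothetical way to define it and exercise the function:

tokens_pattern = r"[A-Za-z]+(?:'[a-z]+)?|\d+"  # hypothetical pattern: words with optional apostrophes, plus numbers
print(split_str("It's 42 degrees"))
# ["It's", '42', 'degrees']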
Example #57
import re

str = "aoweijriow aofejofenr wajioejfo er (!#^ &% etc@ 3 $ $%#) wearjwoaieraw awoerj oawier"

p = re.compile('\(.*\)')
print(p.findall(str))

print("22th".isalnum())

from nltk import regexp_tokenize
txt = "Today it's 07.May 2011. Or 2.999."
print(regexp_tokenize(txt, pattern=r'\w+(?:[.,]\w+)*|\S+'))
# e.g. ['Today', 'it', "'s", '07.May', '2011', '.', 'Or', '2.999', '.']
Example #58
        trainable = True

if trainable:
    words = []  # this will contain the root words
    labels = []  # this will contain the tags
    docs_patterns = []  # this will contain each pattern list
    docs_labels = [
    ]  # this will contain each tag, but many times to get the amount of the tags

    for intent in data['intents']:
        for pattern in intent['patterns']:
            # Equals the list pattern to words_pattern without punctuation marks
            # ['Hi']
            # ['How', 'are', 'you']
            # ...
            words_pattern = nltk.regexp_tokenize(pattern, "(\d+|\w+)")

            # Extend the list words with the list 'words_pattern'
            # words = ['Hi', 'How', 'are', 'you', 'Is', 'anyone', 'there', ...]
            words.extend(words_pattern)

            # Append the list 'words_pattern' to the list doc_x (not extending)
            # doc_patterns = [['Hi'], ['How', 'are', 'you'], ['Is', 'anyone', 'there']], ... ]
            docs_patterns.append(words_pattern)

            # Adding the tags to the list doc_y
            # doc_labels = ['greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'goodbye', 'goodbye', ...]
            docs_labels.append(intent["tag"])

        # Append each label one time in list labels
        # labels = ['greeting', 'goodbye', 'thanks', ...]
Example #59
 def tokenize(self, formula):
     clean_formula = re.sub(self._REGEX_CLEAN, '', formula)
     clean_formula = re.sub(self._REGEX_LETTERS, '', clean_formula)
     tokens = regexp_tokenize(clean_formula, self._REGEX_TOKEN)
     return tokens
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer  #stemming and lemmatizing
from nltk import regexp_tokenize  #tokenize
import pandas as pd
import csv

wr = open("stemmed_words.txt", "wr")  #write to stemmed word text file
with open('words_alpha.csv', 'r') as csvfile:  #read from csv file
    data = csv.reader(csvfile, delimiter=' ')
    print data
    stemmer1 = PorterStemmer()  #porterStemmer
    lemma = WordNetLemmatizer()  #lemmatizer
    for row1 in data:
        #print (row1)
        row1 = row1[0].replace('\n',
                               '')  #replace occurence of new line with nochar
        x = stemmer1.stem(row1)
        y = lemma.lemmatize(row1)
        row1 = regexp_tokenize(row1, "[\w']+")  #tokenizing
        z = nltk.pos_tag(row1)  #pos_tagging
        #print row1
        print z[0][0], z[0][1], x, y  #,z'''
        wr.write(z[0][0] + " " + z[0][1] + " " + x + " " + y + '\n')  #writing
csvfile.close()  #closing files
wr.close()