    def getText(self):
        text1 = self.textEdit.toPlainText()
        tokenizer = RegexpTokenizer(r'\w+')
        list1 = tokenizer.tokenize(text1)
        print("-----------------------------------")
        text2 = self.textEdit_2.toPlainText()
        list2 = tokenizer.tokenize(text2)

        print("following are the word lists of the two text fields")
        print(list1)
        print(list2)

        sims = []
        initialList = []

        # Compare every word of the first text against every word of the
        # second using WordNet Wu-Palmer similarity.
        for word1, word2 in product(list1, list2):
            syns1 = wordnet.synsets(word1)
            print(syns1)
            syns2 = wordnet.synsets(word2)
            print(syns2)
            for syn1 in syns1:
                for syn2 in syns2:
                    s = syn1.wup_similarity(syn2)
                    if s is None:
                        s = 0

                    initialList.append(s)
                    print(str(syn1) + " second word: " + str(syn2))
                    print(s)
            print(initialList)
Example No. 2
    def get_tokenizer(self, iRegex=None):
        if iRegex is not None:
            tokenizer = RegexpTokenizer(iRegex)
        else:
            if self.language == "fr":
                tokenizer = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
            elif self.language == "en":
                tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
            else:
                tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
        return tokenizer
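The enclosing class is not shown in this excerpt, so the following is only a small standalone sketch that exercises the two language-specific patterns used by get_tokenizer; the sample strings are invented for illustration.

# Hypothetical check of the French and English patterns from get_tokenizer above.
from nltk.tokenize import RegexpTokenizer

fr_tok = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
en_tok = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

print(fr_tok.tokenize("L'arbre est grand, n'est-ce pas ?"))
print(en_tok.tokenize("The book costs $12.50 today."))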
Example No. 3
def filter_sentence(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    word_tokens = tokenizer.tokenize(sentence)

    filtered_words = [w for w in word_tokens if w not in stop_words]
    snowball_result_set = [snowball_stemmer.stem(word) for word in filtered_words]
    return snowball_result_set
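filter_sentence relies on module-level stop_words and snowball_stemmer objects that are not part of this excerpt; a minimal, assumed setup and call might look like this. Note that the stop-word check is case-sensitive because the tokens are never lowercased.

# Hypothetical module-level objects assumed by filter_sentence above.
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

stop_words = set(stopwords.words('english'))
snowball_stemmer = SnowballStemmer('english')

print(filter_sentence("the quick brown foxes were jumping over the lazy dogs"))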
Example No. 4
def test():
    global N, words, network

    print('In testing.')

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer(r'\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg) 

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = list(network.forward(V).w)
    topics = []
    remaining = len(pred)
    while len(topics) != 5 and remaining > 0:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]

        if topic in gettysburg_tokens:
            topics.append(topic)

        # Mask this activation out instead of deleting it, so indices stay
        # aligned with `words` on later iterations.
        pred[topic_idx] = float('-inf')
        remaining -= 1

    print('Topics of the Gettysburg Address:')
    print(topics)
def tokenize(text):
    """Generic wrapper around different tokenization methods.
    """
    text = p.clean(text)
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    tokens = [word.lower() for word in tokens if len(word) > 3 and word.isalpha()]
    return tokens
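The p module is not defined in this excerpt; it is presumably the tweet-preprocessor package imported as p, but that is an assumption. Under that assumption, a quick call could be:

# Assumed import for the p.clean() call above (tweet-preprocessor package).
import preprocessor as p
from nltk.tokenize import RegexpTokenizer

print(tokenize("Check out https://example.com for the cleaned dataset! #nlp"))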
Example No. 6
    def __init__(self, text):
        print('Extracting keywords...')

        self.text = text
        self.graph = defaultdict(lambda: 0)
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.make_graph()
Example No. 7
def load_data():
    global N, words

    raw = list(word for fileid in corpus.fileids()
               for word in corpus.words(fileid))
    words = list(
        token
        for token in RegexpTokenizer(r'\w+').tokenize(' '.join(raw)))[100:1000]
    tokens = set(words)
    tokens_l = list(tokens)
    N = len(tokens)
    print('Corpus size: {} words'.format(N))

    step = 4
    data = []
    for gram in ngrams(words, step):
        w1, w2, w3, pred = gram
        V = Vol(1, 1, N, 0.0)
        V.w[tokens_l.index(w1)] = 1
        V.w[tokens_l.index(w2)] = 1
        V.w[tokens_l.index(w3)] = 1
        label = tokens_l.index(pred)
        data.append((V, label))

    return data
Example No. 8
def analyze_dataset():
    l_sentences = []
    with open(
            '/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-negative.txt'
    ) as file1:
        r = reader(file1, dialect='excel-tab')
        for row in r:
            l_sentences.append(row[0])
    with open(
            '/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-positive.txt'
    ) as file2:
        r = reader(file2, dialect='excel-tab')
        for row in r:
            l_sentences.append(row[0])

    # chunk the given text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    d_lengths = defaultdict(int)
    tokenizer2 = RegexpTokenizer(r'\w+')

    # clean sentences from punctuation
    l_sentences = [
        ''.join(ch for ch in sent if ch not in set(string.punctuation))
        for sent in l_sentences
    ]
    l_sentences = [len(tokenizer2.tokenize(sen)) for sen in l_sentences]
    total_sent = len(l_sentences)
    d_lengths = Counter(l_sentences)

    print(total_sent)
    lengths = sorted(d_lengths.items(),
                     key=lambda key_value: int(key_value[0]))
    plot(lengths)
Example No. 9
def pre_process_text_series(data: Series,
                            tokenizer: TokenizerI = None,
                            stop_words: Set[str] = None,
                            lemmatizer: WordNetLemmatizer = None) -> List[str]:
    """
    Clean up given Series column to turn all texts to lowercase,
    remove stopwords, tokenize, and lemmatize.

    :param data: Series with text that needs to be pre-processed.
    :param tokenizer: nltk tokenizer to break text paragraph into words.
    :param stop_words: List of stop words.
    :param lemmatizer: nltk lemmatizer to reduce words to their base words.
    """
    if tokenizer is None:
        tokenizer = RegexpTokenizer(r'\w+')
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    txt = data.str.lower().str.cat(sep=' ')  # lower case
    words = tokenizer.tokenize(txt)  # tokenize
    words = [w for w in words if w not in stop_words]  # remove stop words
    words = [lemmatizer.lemmatize(w) for w in words]

    return words
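A small usage sketch on an invented pandas Series (requires the NLTK stopwords and wordnet corpora to be downloaded):

# Hypothetical input; pre_process_text_series returns a flat list of lemmas.
from pandas import Series

reviews = Series([
    "The batteries were draining far too quickly.",
    "Battery life is great and charging is fast!",
])
print(pre_process_text_series(reviews))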
Example No. 10
    def word_counts(data: Series,
                    stop_words: Union[Set[str], str] = 'english',
                    tokenizer: Union[TokenizerI, str] = r'\w+',
                    lemmatizer=None) -> Series:
        """
        Return a count of each word in the series of responses.

        :param data: Series containing response texts.
        :param stop_words: Set of stop words or language.
        :param tokenizer: TokenizerI or string to pass to RegexpTokenizer.
        :param lemmatizer: Optional Lemmatizer. Defaults to WordNetLemmatizer.
        """
        if isinstance(stop_words, str):
            stop_words = set(stopwords.words(stop_words))
        if isinstance(tokenizer, str):
            tokenizer = RegexpTokenizer(tokenizer)
        if lemmatizer is None:
            lemmatizer = WordNetLemmatizer()

        def process(response: str) -> List[str]:
            """
            Process a single string.
            """
            words = tokenizer.tokenize(response.lower())
            words = [w for w in words if w not in stop_words]
            words = [lemmatizer.lemmatize(w) for w in words]
            return words

        processed = data.map(process)
        word_counts = Series([
            word for _, response in processed.items() for word in response
        ]).value_counts()

        return word_counts
Example No. 11
def prep_text_to_stem(text):
    """
    Remove partes indesejadas como números e palavras na stop_list. Além disso adicionar # ao final da
    palavra a fim de facilitar no stems de uma única letra
    :param text:
    :return:
    """
    text = list(filter(lambda x: type(x) == str, text))

    tokenizer = RegexpTokenizer(r'\w+', flags=re.UNICODE)
    tokens = tokenizer.tokenize(' '.join(text).lower())

    new_tokens = []

    stop_list = Counter(tokens).most_common(300)
    stop_list = [tup[0] for tup in stop_list]
    stop_list.append('series([],')

    for token in tokens:
        if token not in stop_list:
            token = ''.join(
                [letter for letter in token if not letter.isdigit()])
            for pun in punct:
                token = token.replace(pun, '')
            new_token = token + '#'
            new_tokens.append(new_token)

    return ' '.join(new_tokens)
def frequencyAnalyse(polarised_tweets: Dict):
    positive_words = {}
    negative_words = {}
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = list(stopwords.words('english'))
    for i in polarised_tweets:
        word_pit = tokenizer.tokenize(polarised_tweets[i][0])
        tags = nltk.pos_tag(word_pit)
        for word in tags:
            if word[0] in positive_words:
                positive_words[word[0]] += 1
                continue
            elif word[0] in negative_words:
                negative_words[word[0]] += 1
                continue
            if len(word[0]) < 3:
                continue
            if word[0].lower() in stop_words:
                continue
            if word[1] in ['JJ']:
                if polarised_tweets[i][1] > 0.2:  #Positive
                    positive_words[word[0].lower()] = 1
                elif polarised_tweets[i][1] < -0.2:  #Negative
                    negative_words[word[0].lower()] = 1
    for w in sorted(negative_words, key=negative_words.get, reverse=True):
        print(w, negative_words[w])
    return (positive_words, negative_words)
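The expected shape of polarised_tweets is only implicit above; from the indexing it appears to map a tweet id to a (text, polarity) pair, so a hypothetical call could be:

# Invented input: id -> (tweet text, polarity score in [-1, 1]).
sample_tweets = {
    1: ("What a wonderful and insightful talk", 0.8),
    2: ("That was a dreadful, boring presentation", -0.7),
}
positive, negative = frequencyAnalyse(sample_tweets)
print(positive)
print(negative)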
Example No. 13
def summarize(text): 
    tokenizer = RegexpTokenizer(r'\w+')
    formatted_text = tokenizer.tokenize(text.lower())
    sentence_list = nltk.sent_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')
    word_frequencies = {}
    for word in formatted_text:
        if word not in stopwords:
            if word not in word_frequencies:
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    max_freq = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word]/max_freq
    
    sentence_scores = {}
    for sent in sentence_list:
        for word in tokenizer.tokenize(sent.lower()):
            if word in word_frequencies:
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    import heapq
    summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)  
    return summary
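A quick self-contained call (requires the NLTK punkt and stopwords data); the input paragraph is invented for illustration:

sample = (
    "Natural language processing lets computers work with human language. "
    "It covers tasks such as tokenization, tagging, parsing and summarization. "
    "Extractive summarizers score each sentence by the frequency of its words. "
    "The highest-scoring sentences are then joined to form the summary."
)
print(summarize(sample))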
Example No. 14
    def add_to_index(self, document, doc_id):
        # parser = HTMLParser(text=document['data'])
        text = document['data']

        # print(1)

        nlp = Russian()
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        tokens = [token.lower() for token in tokens]
        tmp_text = ' '.join(tokens)
        if len(tokens) > 10e5:
            return
        self.doc_iter += 1
        nlp.max_length = 10e7
        doc_text = nlp(tmp_text, disable=['ner', 'parser'])
        lemmas = []
        # for lemma in tokens:
        for s in doc_text:
            lemma = s.lemma_
            lemmas.append(lemma)
            # if lemma not in set(stopwords.words('russian')) \
            #         and lemma not in set(stopwords.words('english')) \
            #         and len(lemma) > 1:
            #     lemmas.append(lemma)
        freq = FreqDist(lemmas)
        for k, v in freq.most_common():
            if k not in self.global_index:
                self.global_index[k] = []
            self.global_index[k].append((doc_id, v))
Example No. 15
def read_all_txt_orig(directory):
    all_s = []
    for file in os.listdir(directory):
        full_path = os.path.join(directory, file)
        if not file.endswith(".txt"):
            continue
        with open(full_path) as f:
            captions = f.read().split('\n')
            for cap in captions:
                if len(cap) == 0 or len(cap) == 1:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    if t == 'thisbirdhasadarkgreybelly':
                        print(123)
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_s.append(" ".join(tokens_new) + "\n")
    return all_s
Example No. 16
class NormalizerPipeline(object):
    tokenizer = RegexpTokenizer(r"\w+")
    stemmer = PorterStemmer()

    def process_item(self, meme, spider):
        text = meme["name"] + " " + meme["title"] + " " + meme["caption"]
        tokens = None
        filtered = None
        while True:
            try:
                tokens = [
                    self.stemmer.stem(token)
                    for token in self.tokenizer.tokenize(text.lower())
                ]
                filtered = set(tokens) - set(stopwords.words("english"))
            except LookupError:
                download("stopwords")
            else:
                break
        if not filtered:
            raise DropItem("No tokens found in meme")
        meme["postings"] = {key: tokens.count(key) for key in filtered}
        meme["length"] = sum(meme["postings"].values())
        return meme
Example No. 17
    def test_ww_edges_time(self):
        print("loading and parsing data, this might take a few seconds...")
        time = [datetime.now()]
        train = pd.read_csv("../../data/amazon/train.csv")

        X = train['Text'].tolist()
        cv = CountVectorizer(stop_words="english", min_df=5, max_df=0.9).fit(X)
        n_vocab = len(cv.vocabulary_)
        n_documents = len(X)
        X = jl.Parallel(n_jobs=8)(jl.delayed(lambda doc: [
            x.lower() for x in RegexpTokenizer(r"\w+").tokenize(doc)
            if x.lower() in cv.vocabulary_
        ])(doc) for doc in tqdm(X))
        max_sent_len = max(map(len, X))
        X = np.array(jl.Parallel(n_jobs=8)(
            jl.delayed(lambda doc: [cv.vocabulary_[w] for w in doc] + [-1] *
                       (max_sent_len - len(doc)))(doc) for doc in X),
                     dtype=np.int32)

        # test for the unit test, we are going down the rabbit hole
        assert X.shape == (n_documents, max_sent_len)
        time.append(datetime.now())
        print(f"loading complete! Took {time[1] - time[0]}")
        print("starting unit test...")
        result = compute_word_word_edges(X,
                                         n_vocab,
                                         n_documents,
                                         max_sent_len,
                                         n_jobs=8)
        print(f"edge shape is {result[0].shape}")
        print(result)
        time.append(datetime.now())
        print(f"graph building took {time[2] - time[1]}")
def prepare_text(text: pd.Series) -> pd.Series:
    """
    Naive approach to text cleaning. Strip out HTML, then do relatively strict
    preparation (lemmatization, stopwords)

    :param text: series of all relevant text data
    """
    # first, remove html tags
    wo_html = text.apply(lambda x: BeautifulSoup(x, "lxml").text)

    tokenizer = RegexpTokenizer(r'\w+')
    stopword_set = set(stopwords.words('english'))
    lmtzr = WordNetLemmatizer()

    clean_text = []
    pbar = tqdm(range(len(text)), desc='clean_text')
    for d in wo_html:
        dlist = d.lower()
        dlist = tokenizer.tokenize(dlist)
        dlist = list(set(dlist).difference(stopword_set))
        # filter tokens
        filtered_tokens = []
        for token in dlist:
            if re.search('^[a-zA-Z]+$', token) and len(token) >= 4:
                filtered_tokens.append(token)
        # lemmatize
        stems = [lmtzr.lemmatize(t) for t in filtered_tokens]
        final_stems = [stem for stem in stems if len(stem) > 3]
        clean_text.append(final_stems)
        pbar.update()
    pbar.close()
    return pd.Series(clean_text, index=text.index)
Example No. 19
def clean_text(text, stop_words):
    '''Make text lowercase, tokenize words and words with apostrophes, convert contractions to full words,
    lemmatize by POS tag, remove stop words and words shorter than 3 letters.'''
    
    # make text lowercase
    text = text.lower().replace("’", "'")

    # initial tokenization to remove non-words
    tokenizer = RegexpTokenizer("([a-z]+(?:'[a-z]+)?)")
    words = tokenizer.tokenize(text)

    # convert contractions
    contractions = load_dict_contractions()
    words = [contractions[word] if word in contractions else word for word in words]
    text = ' '.join(words)

    # remove stop words, lemmatize using POS tags, and remove two-letter words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in nltk.word_tokenize(text) \
             if word not in stop_words]
    
    # removing any words that got lemmatized into a stop word
    words = [word for word in words if word not in stop_words]
    words = [word for word in words if len(word) > 2]
    text = ' '.join(words)
    
    return text
Example No. 20
    def preprocessing(self):
        self.df = pd.read_csv('static/models/resampled_comments_1.csv')
        self.comments = self.df[['comment', 'rating', 'sentiment']]
        self.comments['comment'] = self.comments['comment'].map(
            lambda x: x.lower())

        tokenizer = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
        token = self.comments.apply(
            lambda row: tokenizer.tokenize(row['comment']), axis=1)

        stop_words = set(stopwords.words('french'))
        stop_token = token.apply(
            lambda x: [item for item in x if item not in stop_words])

        stemmer = SnowballStemmer(language='french')
        stemm = stop_token.apply(lambda x: [stemmer.stem(y) for y in x])

        lemmatizer = FrenchLefffLemmatizer()
        lemm = stemm.apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

        for i in range(len(lemm)):
            lemm[i] = ' '.join(lemm[i])

        self.comments['lemmatiser_com'] = lemm
        data = self.comments[['comment', 'lemmatiser_com', 'sentiment']]

        self.df = pd.DataFrame(data)
        return self.df
Example No. 21
def analyze_articles():
    json_document = _read_json_articles()
    l_articles = [
        json_document[i]['_source']['content']
        for i in range(len(json_document))
    ]

    # chunk the given text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    d_lengths = defaultdict(int)
    tokenizer2 = RegexpTokenizer(r'\w+')
    total_sent = 0

    for article in l_articles:
        l_sentences = tokenizer.tokenize(article)
        # clean sentences from punctuation
        l_sentences = [
            ''.join(ch for ch in sent if ch not in set(string.punctuation))
            for sent in l_sentences
        ]
        l_sentences = [len(tokenizer2.tokenize(sen)) for sen in l_sentences]
        total_sent += len(l_sentences)
        d_counts = Counter(l_sentences)
        for key in d_counts.keys():
            d_lengths[str(key)] += d_counts[key]
    print(total_sent)
    lengths = sorted(d_lengths.items(),
                     key=lambda key_value: int(key_value[0]))
    plot(lengths)
Example No. 22
def main(argv=None):

    english_stops = set(stopwords.words('english'))
    csvinfile = open('tweet.csv', 'r', newline='')

    #para = "Hello World. It's good to see you. Just being here makes me jealous. 'this is in quotes', said the frog. I love to see you. I ate pies. I ate pies."
    filter_stops = lambda w: len(w) < 3 or w in english_stops

    bcf = collocations.BigramCollocationFinder

    documents = []
    reader = csv.DictReader(csvinfile)
    for row in reader:

        token_sentence = RegexpTokenizer(r"[@]?[\w]+('\w)*|[\S]").tokenize(
            row["text"].lower())
        documents.append(token_sentence)
        print(token_sentence)
        #print pos_tag(token_sentence)
        #for word in token_sentence:
        #if word not in english_stops:
        #print word

    bcf2 = bcf.from_documents(documents)
    bcf2.apply_word_filter(filter_stops)
    print(bcf2.nbest(metrics.BigramAssocMeasures.likelihood_ratio, 15))
    return
Example No. 23
    def tokenize(self, attr):
        accepted = {
            'title': self.title,
            'description': self.description,
            'cve': self.cve,
            'cwe': self.cwe,
            'refs': self.refs,
            'dsk': self.dsk
        }
        matcher = {
            'title': r'\w+[-\w+]*',
            'description': r'\w+[-\w+]*',
            'cve': r'CVE[\s|-]\d+[\s|-]\d+'
        }

        if attr not in accepted.keys():
            return 'It is not possible to tokenize this plugin attribute.'

        tokenizer = RegexpTokenizer(matcher[attr])
        stop = stopwords.words('english')
        final = []
        if attr == 'title' or attr == 'description':
            intermediate = tokenizer.tokenize(accepted[attr])
            final = [i.lower() for i in intermediate if i not in stop]
        elif attr == 'cve':
            intermediate = tokenizer.tokenize(','.join(accepted[attr]))
            final = [i.lower().replace(' ', '-') for i in intermediate if i not in stop]

        return final
    def processText(self,Estr):
        # (1) Strip HTML tags
        content = re.sub(r'<[^>]*>', ' ', Estr)

        # (2) Remove punctuation and other non-letter characters
        tokenizer = RegexpTokenizer(r'[a-z]+')
        raw = str(content).lower()
        content = tokenizer.tokenize(raw)

        # (3) Remove stop words
        # Load the English stop word list
        en_stop = stopwords.words('english')  # get_stop_words('en')
        # Load a custom stop word list
        # file = os.getcwd()+"\\..\\datasets\\stopwords.txt"
        # f = open(file, "r")
        # mystopwords = f.read()
        # mystopwords = mystopwords.split('\n')
        # for word in mystopwords:
        #     en_stop.add(word)
        # Remove stop words from the text
        stopped_tokens = [i for i in content if i not in en_stop]

        # (4) Filter by length
        content = [i for i in stopped_tokens if len(i) > 2]

        return content
Example No. 25
        def clicked():
            res = self.Scrolledtext1.get(1.0, "end-1c")

            commentLijst = []
            commentLijst.append(res)

            vertalingsLijst = noiseRemoval(commentLijst)

            tokenizer = RegexpTokenizer(r'\w+')

            for y in range(len(vertalingsLijst)):

                print("\nRating...")
                score2 = rate(vertalingsLijst[y])
                print(vertalingsLijst[y])
                print(score2)

                if score2 > 0:
                    self.Label1.configure(text="Positive")
                    self.Label1.config(fg="green")

                elif score2 == 0:
                    self.Label1.configure(text="Neutral")
                    self.Label1.config(fg="gray")

                else:
                    self.Label1.configure(text="Negative")
                    self.Label1.config(fg="red")
Example No. 26
    def tokenize(self, string):
        # Remove unnecessary spaces
        space = re.compile(r' +')
        string = re.sub(space, ' ', string)

        # Normalize phone numbers
        tel = re.compile(
            r'(?P<sep1>0[0-9])( |/+|\-|\\+)(?P<sep2>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep3>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep4>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep5>[0-9]{2})'
        )
        string = tel.sub(r'\g<sep1>.\g<sep2>.\g<sep3>.\g<sep4>.\g<sep5>',
                         string)

        # Tokenization
        # The tokenizer automatically drops the following characters when they appear in isolation: `^ ° ¤ ¨
        # Recognized as tokens:
        # - Email addresses
        # - Websites, domain names, usernames, etc.
        # - Short phone numbers
        # - Compound words
        # - Ordinary words
        # - Punctuation
        tokenizer = RegexpTokenizer(
            r'''([Aa]ujourd'hui|\w+'|[a-zA-ZÀ-Ÿà-ÿ0-9_\.\-]+@[a-zA-ZÀ-Ÿà-ÿ0-9\-\.]+\.[a-zA-ZÀ-Ÿà-ÿ0-9]+|[a-zA-ZÀ-Ÿà-ÿ0-9:@%/;$~_?\+\-=\\\.&\|£€]+[a-zA-ZÀ-Ÿà-ÿ0-9#@%/$~_?\+\-=\\&\|£€]+|[\wÀ-Ÿà-ÿ]+[/\-][\wÀ-Ÿà-ÿ]+|[\wÀ-Ÿà-ÿ0-9]+|\.\.\.|[\(\)\[\]\{\}\"\'\.,;\:\?!\-\_\*\#\§=+<>/\\])'''
        )
        tokens = tokenizer.tokenize(string)
        return tokens
    def __init__(self, bioasq_json, context_token_limit=-1,
                 types=None, include_synonyms=False, include_answer_spans=True):
        """
        Creates the BioAsqSquadBuilder.
        :param bioasq_json: The BioASQ JSON object.
        :param context_token_limit: If larger than 0, contexts will only be
                added as long as the token limit is not exceeded.
        :param types: Question types to include
        :param include_synonyms: If True, the answers object is a list of lists
                (which is NOT the SQuAD format) with the outer list containing
                the answers (i.e., correct answers of the list question) and
                inner list containing the synonyms. If False, the answers object
                is a flat list and only one synonym is included.
        :param include_answer_spans: Whether to include exact answers. If True,
                questions that are not extractive are skipped.
        """

        self._bioasq_json = bioasq_json
        self._types = types
        if self._types is None:
            self._types = ["factoid", "list"]

        self._tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
        self._context_token_limit = context_token_limit
        self._include_synonyms = include_synonyms
        self._include_answer_spans = include_answer_spans
        self._paragraphs = None
        self._stats = {
            "contexts_truncated": 0,
            "max_context_length": 0,
        }
Example No. 28
def cleanreview(t):
    t = t.lower()
    t = RegexpTokenizer(r'[a-zA-Z]+').tokenize(t)
    t = [x for x in t if x not in stop_words]
    t = [lemmatizer.lemmatize(x, pos="v") for x in t]
    t = " ".join(t)
    return t
Example No. 29
def clean_text(t):
    sentence = t.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = filter(
        lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered_words)
    def tensor_vec_pipline(data, word_index, max_len):
        # Create the data matrix to be fed to the keras model.
        print("Creating data to feed to tensorflow")
        df_len = len(data)
        indexing_matrix = np.zeros((df_len, max_len), dtype='int32')

        tokenizer = RegexpTokenizer(r'\w+')

        for r_inc, (_, row) in enumerate(data.iterrows()):
            sentence = row['sentence']
            sen_tokenize = tokenizer.tokenize(sentence)

            # Fill row r_inc with the vocabulary index of each token;
            # unknown words fall back to index 0 (the matrix default).
            for c_inc, word in enumerate(sen_tokenize[:max_len]):
                indexing_matrix[r_inc][c_inc] = word_index.get(word, 0)

        print("Run complete")

        return indexing_matrix