Example #1
def parseTextToSentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
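A minimal usage sketch (the sample text is illustrative; the nltk.tokenize.punkt imports used above are assumed to be in scope):

text = 'Dr. Smith arrived late.\nHe said: "It works!" Then he left.'
for sentence in parseTextToSentences(text):
    print(sentence)
# Because 'dr' is registered as an abbreviation, the first line stays one
# sentence instead of being split after "Dr.".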
Example #2
def analyzer_results():
    essay = request.form.get('essay')
    similarity_nmf = request.form.get('similarity_nmf')
    similarity_tfidf = request.form.get('similarity_tfidf')

    # linebreak_idx = [m.start() for m in re.finditer('\n', essay)]
    s_tokenizer = PunktSentenceTokenizer()
    sentences = s_tokenizer.tokenize(essay)
    top_sentences = textrank.summarize(essay).split('\n')
    top_idx = []
    for i,sentence in enumerate(sentences):
        if sentence in top_sentences:
            top_idx.append(i)
    # Retokenize to get punctuation marks back
    sentences = s_tokenizer.tokenize(essay)
    sentences = list(enumerate(sentences))

    topics,similar_essays = processEssay(essay, similarity_nmf, similarity_tfidf, json_output=False)
    essay1 = similar_essays[0]
    essay2 = similar_essays[1]
    essay3 = similar_essays[2]
    topic1, topic2, topic3, topic4, topic5, topic6, topic7 = topics
    topic_names = ['Family', 'Music', 'Culture', 'Sport', 'Personal/Story', 'Science', 'Career']
    topic_tuples = zip(topic_names, topics)

    # Load interactive plot
    interactive_plot = interactivePlot()

    return render_template('analyzer_results.html', essay1 = essay1, essay2 = essay2, essay3 = essay3, topic_tuples = topic_tuples, sentences=sentences, top_idx=top_idx, interactive_plot=interactive_plot)
Example #3
def summarize_pdf(article_text):
    # Train a Punkt tokenizer on the article itself, then split it into sentences.
    trainer = PunktTrainer()
    trainer.train(article_text)
    tok = PunktSentenceTokenizer(trainer.get_params())
    sentence_list = tok.tokenize(article_text)

    sentence_lists = []   # lower-cased sentences, re-joined from word tokens
    sent_list = []        # lemmatized tokens per sentence
    clean_sent = []       # lemmatized sentences, space-joined

    word_tok = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()
    for sent in sentence_list:
        words = [w.lower() for w in word_tok.tokenize(sent)]
        sentence_lists.append(" ".join(words))
        lemmas = []
        for word, tag in pos_tag(words):
            # Map the Penn Treebank tag onto the WordNet POS expected by the lemmatizer.
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos = 'r'
            else:
                pos = 'a'
            w = lemmatizer.lemmatize(word, pos)
            # Keep the lemma unless it is punctuation or purely numeric.
            if (w not in punc) and re.search(r"[^\d]", w):
                lemmas.append(w.lower())
        clean_sent.append(' '.join(lemmas))
        sent_list.append(lemmas)
    return sent_list, clean_sent, sentence_lists, sentence_list
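A brief sketch of how the four return values might be consumed (the file path is illustrative; the name punc is assumed to be a set of punctuation characters defined elsewhere in the module):

with open('article.txt', encoding='utf-8') as f:
    article_text = f.read()
sent_list, clean_sent, sentence_lists, sentence_list = summarize_pdf(article_text)
print(sentence_list[0])    # original first sentence
print(sentence_lists[0])   # same sentence, lower-cased and re-joined from word tokens
print(clean_sent[0])       # lemmatized version with punctuation and pure numbers removed
print(sent_list[0])        # the same lemmas as a token list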
Example #4
def split_sentences_to_strings(input_file, name="Movie"):
    """
    Transform plain text into a list of strings, each string a sentence.
    """
    data = []
    text = str()
    with open(input_file, newline='', encoding="utf-8") as in_file:
        reader = csv.reader(in_file, delimiter=',', quotechar='"')
        next(reader)  #skip header
        for row in reader:
            if name == "Movie":
                text += row[2]
            elif name == "Financial":
                text += row[3]
    sent_detector = PunktSentenceTokenizer(train_text=text)
    with open(input_file, newline='', encoding="utf-8") as in_file:
        reader = csv.reader(in_file, delimiter=',', quotechar='"')
        next(reader)  #skip header
        for row in reader:
            cleaned_sentences = []
            if name == "Movie":
                sentences = sent_detector.tokenize(row[2].strip())
            elif name == "Financial":
                sentences = sent_detector.tokenize(row[3].strip())
            for sentence in sentences:
                words = wordpunct_tokenize(sentence)
                words_out = []
                for word in words:
                    words_out.append(clean_word(word))
                cleaned_sentence = " ".join(words_out)
                cleaned_sentence = wordpunct_tokenize(
                    cleaned_sentence)  #remove double space
                cleaned_sentence = " ".join(cleaned_sentence)

                cleaned_sentence = wordpunct_tokenize(
                    cleaned_sentence)  #remove double NUMBER token
                last_len = 0
                while last_len != len(cleaned_sentence):
                    last_len = len(cleaned_sentence)
                    for index, word in enumerate(cleaned_sentence):
                        if word == "NUMBER":
                            if index + 1 < len(cleaned_sentence):
                                if cleaned_sentence[index + 1] == "NUMBER":
                                    del (cleaned_sentence[index + 1])

                cleaned_sentence = " ".join(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

            if name == "Movie":
                data.append([cleaned_sentences, row[1]])
            elif name == "Financial":
                data.append([cleaned_sentences, row[4], row[3]])
    return data
Example #5
class PunktSplitter(ISplitter):
    """
    A splitter using the
    `PunktSentenceTokenizer <https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt>`_, the NLTK
    implementation of the "Unsupervised Multilingual Sentence Boundary Detection" algorithm (Kiss and Strunk, 2005).

    .. note::
        The default implementation uses a model trained on English sentences.
        `This kaggle resource <https://www.kaggle.com/nltkdata/punkt/version/2#>`_ offers
        pretrained Punkt Models for other languages as well, including German. In my tests though, German models
        performed poorly compared to the default...

    .. todo::
        Train a Punkt model for Swiss-German.
        (https://stackoverflow.com/questions/21160310/training-data-format-for-nltk-punkt)
    """
    def __init__(self, modelfile=None):
        if modelfile is not None:
            with open(modelfile, 'rb') as f:
                self.tokenizer = pickle.load(f)
        else:
            self.tokenizer = PunktSentenceTokenizer()

    def split(self, text: str) -> List[str]:
        """ Split text using Punkt. """
        paragraphs = (p for p in text.split('\n') if p)
        sentences = []
        for p in paragraphs:
            sentences.extend(self.tokenizer.tokenize(p))
        return sentences
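A hedged sketch of how the pickled model expected by the modelfile argument could be produced with NLTK's PunktTrainer (the corpus path and output filename are illustrative, e.g. for the Swiss-German TODO above):

import pickle
from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

with open('swiss_german_corpus.txt', encoding='utf-8') as f:
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True   # also learn collocations, not just abbreviations
    trainer.train(f.read())

with open('punkt_gsw.pickle', 'wb') as f:
    pickle.dump(PunktSentenceTokenizer(trainer.get_params()), f)

splitter = PunktSplitter(modelfile='punkt_gsw.pickle')
print(splitter.split('Erster Satz. Zweiter Satz.'))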
Example #6
def summarize(text, ref='', lines=7):
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    clean_text = text.lower()
    clean_text = re.sub(r'\W', ' ', clean_text)
    clean_text = re.sub(r'\d', ' ', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    text = text.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = sentence_splitter.tokenize(text)
    #sentences = nltk.sent_tokenize(text)
    stop_words = nltk.corpus.stopwords.words('english')

    word_count = {}
    for word in nltk.word_tokenize(clean_text):
        if word not in stop_words:
            word_count[word] = word_count.get(word, 0) + 1

    sentence_score = {}
    i = 0
    for s in sentences:
        for word in nltk.word_tokenize(s.lower()):
            if word in word_count.keys():
                old = sentence_score.get(s, (0, 0, i))
                i += 1
                sentence_score[s] = (old[0] + word_count[word], old[1] + 1,
                                     old[2])

    def score(pair):
        return (pair[0] - pair[2]) / pair[1]

    scores = {}
    for s in sentence_score.keys():
        if sentence_score[s][1] > 2:
            scores[s] = score(sentence_score[s])
        else:
            scores[s] = score(sentence_score[s]) - 100

    best_sentences = heapq.nlargest(lines, scores, key=scores.get)
    best_sentences.sort(key=lambda x: sentence_score[x][2])

    string = ''

    for s in best_sentences:
        if s[0] == ' ':
            s = s[1:]
        if 'refer' in s and len(scores.keys()) < 4:
            print('Please be more specific\n')
            if len(ref) > 1:
                print('Here are some suggestions:')
            for i in range(len(ref)):
                print("=>", ref[i])
            print('\n')
            return
        print(s)
        string += s + '\n'
    return string
Example #7
def get_todo_items(text):
    all_items = list()
    tokenizer = PunktSentenceTokenizer()
    sen_tokens = tokenizer.tokenize(text)

    for sen_token in sen_tokens:
        todo_items = list()
        tokens = nltk.word_tokenize(sen_token)

        tags = tagger.tag(tokens)
        stop_words = [word for (word, tag) in tags if tag in (tagVB, tagVBP)]

        ind = -1
        for word in stop_words:
            curr_ind = tokens.index(word)
            if curr_ind != 0 and tags[curr_ind - 1][1] in (tagCC, tagRB):
                to_ind = curr_ind - 1
            else: to_ind = curr_ind
            if ind != -1 and abs(to_ind - ind) > 1:
                todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, to_ind)]))
            elif ind != -1 and len(todo_items) > 0:
                last_ind = len(todo_items)
                todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[to_ind - 1]])
            ind = curr_ind

        if ind != -1 and abs(len(tokens) - ind) > 1:
            todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, len(tokens))]))
        elif ind != -1 and len(todo_items) > 0:
            last_ind = len(todo_items)
            todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[len(tokens) - 1]])

        all_items.extend(todo_items)

    return all_items
Example #8
 def fractal_representation(self):
     punkt_param = PunktParameters()
     for each_paragraph in self.paragraphs:
         buffer_p = paragraph()
         buffer_p.paragraph = each_paragraph
         buffer_p.tokens = nltk.word_tokenize(preprocess(each_paragraph))
         buffer_p.weights['words'] = FreqDist(buffer_p.tokens)
         buffer_p.weights['total'] = {'words':0, 'sentences':0}    
         punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
         sentence_splitter = PunktSentenceTokenizer(punkt_param)
         sentences = sentence_splitter.tokenize(each_paragraph)
         for each_sentence in sentences:
             self.stotal += 1
             buffer_s = sentence()
             buffer_s.sentence = each_sentence
             buffer_s.tokens = nltk.word_tokenize(preprocess(each_sentence))
             if len(buffer_s.tokens) > 0:
                 buffer_s.weights['sentence'] = FreqDist(buffer_s.tokens)
                 buffer_s.weights['paragraph'] = self.calculate_relative_frequence(buffer_s.tokens, buffer_p.weights['words'])
                 buffer_s.weights['document'] = self.calculate_relative_frequence(buffer_s.tokens, self.fractal.weights)
                 buffer_s.weights['total'] = {}
                 buffer_s.weights['total']['sentence'] = 1
                 buffer_s.weights['total']['paragraph'] = sum(buffer_s.weights['paragraph'].values())
                 buffer_s.weights['total']['document'] = sum(buffer_s.weights['document'].values())
                 self.s_weight += buffer_s.weights['total']['document']
                 buffer_p.weights['total']['sentences'] += buffer_s.weights['total']['document']
                 buffer_p.sentences.append(buffer_s)
         buffer_p.weights['total']['words'] = sum(buffer_p.weights['words'].values())
         self.fractal.paragraphs.append(buffer_p)
         self.pindex += 1
Example #9
 def summarize(self):
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(self.text)
     structure = {}
     sentence_objects = []
     for idx in range(len(sentences)):
         obj = {'text' : sentences[idx], 'index' : idx , 'data': {}}
         sentence_objects.append(obj)
     structure['sentences'] = sentence_objects
     self.sentencecount = len(structure['sentences'])
     structure['ordered'] = []
     structure['weights'] = {'words' : FreqDist(nltk.word_tokenize(preprocess(self.text))), 'total': 0, 'transformed': 0}
     structure['weights']['total'] = sum(structure['weights']['words'].values())
     self.sentenceIndex = 0
     for each_sent in structure['sentences']:
         each_sent['data']['tokens'] = nltk.word_tokenize(preprocess(each_sent['text']))
         each_sent['data']['sinTransform'] = (1-math.sin(self.sentenceIndex*(math.pi/self.sentencecount)))+1
         for each_word in structure['weights']['words']:
             if each_word in each_sent['data']['tokens']:
                 structure['weights']['words'][each_word] *= each_sent['data']['sinTransform']
         self.sentenceIndex += 1
     structure['weights']['transformed'] = sum(structure['weights']['words'].values())
     self.sentenceIndex = 0
     for each_sent in structure['sentences']:
         each_sent['data']['weights'] = {'words': self.calculate_relative_frequence(each_sent['data']['tokens'], structure['weights']['words']), 'total': 0}
         each_sent['data']['weights']['total'] = sum(each_sent['data']['weights']['words'].values())
         self.sentenceIndex += 1
     structure['ordered'] = sorted(structure['sentences'], key=lambda x:x['data']['weights']['total'], reverse=True)
     structure_keep = structure['ordered'][:self.quota]
     structure_keep.sort(key=lambda x:x['index'])
     for eac_sen in structure_keep:
         self.summary.append(eac_sen['text'])
Example #10
    def _create_data(self):
        if self.split == 'train':
            self._create_vocab()
        else:
            self._load_vocab()
        tokenizer = PunktSentenceTokenizer(preserve_case=False)
        data = defaultdict(dict)
        with open(self.raw_data_path, 'r') as file:
            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                input = ['<sos>'] + words
                input = input[:self.max_sequence_length]
                target = words[:self.max_sequence_length - 1]
                target = target + ['<eos>']

                assert len(input) == len(target), "%i, %i" % (len(input),
                                                              len(target))
                length = len(input)

                input.extend(['<pad>'] * (self.max_sequence_length - length))
                target.extend(['<pad>'] * (self.max_sequence_length - length))
                input = [self.w2i.get(w, self.w2i['<unk>']) for w in input]
                target = [self.w2i.get(w, self.w2i['<unk>']) for w in target]
                id = len(data)
                data[id]['input'] = input
                data[id]['target'] = target
                data[id]['length'] = length

        with io.open(os.path.join(self.data_dir, self.data_file),
                     'wb') as data_file:
            data = json.dumps(data, ensure_ascii=False)
            data_file.write(data.encode('utf8', 'replace'))
        self._load_data(vocab=False)
Example #11
    def _create_vocab(self):
        assert self.split == 'train', "Vocabulary can only be created for training file."
        tokenizer = PunktSentenceTokenizer(preserve_case=False)
        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()
        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)
        with open(self.raw_data_path, 'r') as file:
            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                w2c.update(words)
            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)
        assert len(w2i) == len(i2w)

        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))
        self._load_vocab()
Example #12
def tokenSentence(s):
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(s)
    l = tokenizer.tokenize(s)
    s = '\n'.join(l)

    return s
Example #13
def read_docx(path):
    """read .docx (Microsoft 2007+)
    """
    try:
        doc = docx.Document(path)

        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['fig'])
        tokenizer = PunktSentenceTokenizer(punkt_param)

        body = []
        for p in doc.paragraphs:
            body += tokenizer.tokenize(clean_text(p.text))
        body = '\n'.join(body)

        tables = []
        for t in doc.tables:
            table = {'cells': []}
            for row in t.rows:
                row_elements = []
                for cell in row.cells:
                    for p in cell.paragraphs:
                        row_elements.append({'text': clean_text(p.text)})
                table['cells'].append(row_elements)
            tables.append(table)

        data = PaperData(body, tables)
    except Exception:
        logger.info('fail: %s', path)
        traceback.print_exc()
        return PaperData()

    return data
Example #14
def retrieveUrlText(url):
    try:
        config = Config()
        config.request_timeout = 1000
        config.memoize_articles = False
        config.fetch_images = False
        config.browser_user_agent = 'Mozilla/5.0'
        article = Article(url, config)
        article.download(recursion_counter=5)
        if article.download_state != 2:
            return ''
        article.parse()
        articleText = article.text.replace('\n', ' ')
    except KeyboardInterrupt:
        raise
    except Exception:
        return ''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'fig', 'figs',
        'chem', 'ph'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    articleSentences = validateSentences(
        sentence_splitter.tokenize(articleText))
    return articleSentences
Example #15
 def _split_sentences(self, text):
     from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(text)
     return sentences
Example #16
File: sentence.py, Project: cltk/cltk
    def tokenize_sentences(self, untokenized_string: str):
        """Tokenize sentences by reading trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # load tokenizer
        assert isinstance(untokenized_string, str), \
            'Incoming argument must be a string.'

        if self.language == 'latin':
            tokenizer = super()
        elif self.language == 'greek': # Workaround for regex tokenizer
            self.sent_end_chars=GreekLanguageVars.sent_end_chars
            self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
            self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
        elif self.language in INDIAN_LANGUAGES:
            self.sent_end_chars=SanskritLanguageVars.sent_end_chars
            self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
            self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
        else:
            # Warn that NLTK Punkt is being used by default???
            tokenizer = PunktSentenceTokenizer()

        # mk list of tokenized sentences
        if self.language == 'greek' or self.language in INDIAN_LANGUAGES:
            return re.split(self.pattern, untokenized_string)
        else:
            return tokenizer.tokenize(untokenized_string)
Example #17
File: textrank.py, Project: ko/random
def textrank(document):
    pst = PunktSentenceTokenizer()
    sentences = pst.tokenize(document)

    # Bag of Words
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer()
    bow_matrix = cv.fit_transform(sentences)

    from sklearn.feature_extraction.text import TfidfTransformer
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)

    ## mirrored matrix where the rows and columns correspond to 
    ## sentences, and the elements describe how similar the
    ## sentences are. score 1 means sentences are exactly the same.
    similarity_graph = normalized_matrix * normalized_matrix.T
    similarity_graph.toarray()

    # PageRank
    import networkx as nx
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)

    ## mapping of sentence indices to scores. use them to associate
    ## back to the original sentences and sort them
    scores = nx.pagerank(nx_graph)
    ranked = sorted(((scores[i], s) for i,s in enumerate(sentences)), reverse=True)
    print ranked[0][1]
Example #18
def clean(text):
    # Returns cleaned, tokenized documents from raw HTML text.

    text = cleanmyhtml(text)

    # We need to remove things like (R-NE). There are some wacky abbreviations
    # for states, but all fall under five.
    text = re.sub(r'\w{1}\-\w{1,5}\.', '', text)

    # U.S. needs to become US or else it'll tokenize weirdly. Same with
    # H.R. (house resolution).
    text = re.sub(r'U\.S\.', 'US', text)
    text = re.sub(r'H\.R\.', 'HR', text)

    # NLTK is pretty poor at tokenizing sentences that contain ." or .'
    # We'll insert a space into these.

    text = re.sub(r'\.\"', '. \"', text)
    text = re.sub(r'\"\.', '. \'', text)

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'reps', 'Reps', 'H.R', 'h.r', 'hr', 'HR', 'vs', 'mr', 'ms',
        'pres,', 'mrs', 'prof', 'inc', 'sens', 'Sens', 'Sen', 'sen'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return (sentences)
Example #19
    def __getlemmas(self, txt):
        '''
            Filters nouns, adjectives and verbs from the input text, lemmatizes them
            and returns them as a list of words (tokens).

            Parameters:
                @txt  : The text (str) to be lemmatized
        '''

        lemma = WordNetLemmatizer()
        punkts = PunktParameters()
        punkts.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
        sent_tokenizer = PunktSentenceTokenizer(punkts)
        sentences = sent_tokenizer.tokenize(txt)

        lemma_tokens = []
        for sentence in sentences:
            stoken = word_tokenize(sentence)
            pos_sent = pos_tag(stoken)

            for p in pos_sent:
                if p[1].startswith('N'):
                    pos = wordnet.NOUN
                elif p[1].startswith('J'):
                    pos = wordnet.ADJ
                elif p[1].startswith('V'):
                    pos = wordnet.VERB
                else:
                    pos = None

                if pos:
                    lemma_tokens.append(lemma.lemmatize(p[0].lower(), pos))

        return lemma_tokens
Example #20
    def tokenize_sentences(self, untokenized_string: str):
        """Tokenize sentences by reading trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # load tokenizer
        assert isinstance(untokenized_string,
                          str), "Incoming argument must be a string."

        if self.language == "latin":
            tokenizer = super()
        elif self.language == "greek":  # Workaround for regex tokenizer
            self.sent_end_chars = GreekLanguageVars.sent_end_chars
            self.sent_end_chars_regex = "|".join(self.sent_end_chars)
            self.pattern = rf"(?<=[{self.sent_end_chars_regex}])\s"
        elif self.language in INDIAN_LANGUAGES:
            self.sent_end_chars = SanskritLanguageVars.sent_end_chars
            self.sent_end_chars_regex = "|".join(self.sent_end_chars)
            self.pattern = rf"(?<=[{self.sent_end_chars_regex}])\s"
        else:
            # Warn that NLTK Punkt is being used by default???
            tokenizer = PunktSentenceTokenizer()

        # mk list of tokenized sentences
        if self.language == "greek" or self.language in INDIAN_LANGUAGES:
            return re.split(self.pattern, untokenized_string)
        else:
            return tokenizer.tokenize(untokenized_string)
Example #21
def nmf(document):

    # Split the document into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)

    # Count term frequencies
    c = CountVectorizer()

    # Compute tf-idf
    bow_matrix = c.fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # All words in the bag-of-words vocabulary
    all_words = c.get_feature_names()

    # index -> word mapping
    index2words = {v: k for k, v in c.vocabulary_.items()}

    nmf = NMF(n_components=2, random_state=27, alpha=0.1, l1_ratio=0.5).fit(normalized)

    # Document-topic weight matrix
    w = nmf.transform(normalized)

    # Shape of the topic-term (components) matrix
    f = nmf.components_.shape
Example #22
def plagiarismChecker():
    text = request.form['text_to_check']
    if (text.lstrip().rstrip() == ''):
        return render_template('input.html')
    punkt_parameters = PunktParameters()
    sentence_splitter = PunktSentenceTokenizer(punkt_parameters)
    sentences = sentence_splitter.tokenize(text)
    probability_of_plagiarism = 0
    for a_sentence in sentences:
        time.sleep(0.1)
        content = list(filter(lambda x: x in string.printable, a_sentence))
        str1 = ''.join(content)
        print(str1)
        # temp=list(content)
        # print(str(temp))
        the_term = urllib.parse.quote('+' + '"' + str1 + '"')
        page = requests.get('https://www.bing.com/search?q=' + the_term)
        print(page.url)
        if ((not "There are no results for" in page.text)
                and (not "No hay resultados para" in page.text)
                and (not "are no results for" in page.text)):
            probability_of_plagiarism += 1
    percent_plagiarised = str(
        (probability_of_plagiarism / len(sentences)) * 100) + '%'
    return render_template('results.html',
                           text=text,
                           percent_plagiarised=percent_plagiarised)
Example #23
def rank_sentences(text, sentence_scores, title="", n=7):

    final_sentences = []

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())

    for s in sentence_scores:
        if title == "":
            break
        else:
            sentence_scores[s] *= (1 + similarity_score(title, s))

    sc = sentence_scores.copy()
    sc = OrderedDict(sorted(sc.items(), key=lambda t: t[1], reverse=True))
    ordered_sents = dict(islice(sc.items(), n))

    proper_sentences = sent_tokenizer.tokenize(text)

    for s in proper_sentences:
        if s.lower() in ordered_sents:
            final_sentences.append(s)

    return final_sentences
Example #24
    def article_sentences(self, article_text):  # take in article.text

        document = ' '.join(article_text.strip().split('\n'))
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(document)

        return sentences
Example #25
 def tokenise(self, sample):
     # first pass - look for poems
     verses = self.scan_for_verse(sample)
     if verses:
         self.notes.append("got {} verses".format(len(verses)))
         verses = [ re.sub(r'\[\d+\]', '', v) for v in verses ]
     else:
         verses = []
     # second pass - look for sentences
     text = re.sub(r'\[\d+\]', '', sample)
     text = re.sub("\r\n", ' ', text)
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(self.cf['abbreviations'])
     tokenizer = PunktSentenceTokenizer(punkt_param)
     sentences = tokenizer.tokenize(text)
     sentences = sentences[1:-1]
     self.notes.append("got {} sentences".format(len(sentences)))
     # remove any sentences which we already found as part of verses
     for s in list(sentences):  # iterate over a copy; removing from the list being looped over would skip items
         matches = [ v for v in verses if s[:SENTENCE_MATCH] in v ]
         if matches:
             self.notes.append("found sentence {} in verses {}".format(s, matches))
             sentences.remove(s)
     verses.extend(sentences)
     return verses
Example #26
    def get_key_sentences(self, n=5):
        '''
        Uses a simple implementation of TextRank to extract the top N sentences
        from a document.

        Sources:
        - Original paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
        - Super useful blog post: http://joshbohde.com/blog/document-summarization
        - Wikipedia: http://en.wikipedia.org/wiki/Automatic_summarization#Unsupervised_keyphrase_extraction:_TextRank
        '''
        # Tokenize the document into sentences. More NLP preprocessing should also happen here.
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(self.doc)

        # Calculate word counts and TFIDF vectors
        word_counts = CountVectorizer(min_df=0).fit_transform(sentences)
        normalized = TfidfTransformer().fit_transform(word_counts) 

        # Normalized graph * its transpose yields a sentence-level similarity matrix
        similarity_graph = normalized * normalized.T
     
        nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
        scores = nx.pagerank(nx_graph)
        return sorted(((scores[i], s) for i, s in enumerate(sentences)),
                      reverse=True)[:n]
Example #27
def getSentences(text):
	#returns a list of sentences tokenized by Punkt
	punkt_param = PunktParameters()
	punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
	sentence_splitter = PunktSentenceTokenizer(punkt_param)
	sentences = sentence_splitter.tokenize(text)
	return sentences
Example #28
def process(text, word_sets_folder="algorithms/data/word_sets"):
    word_sets = import_word_sets(word_sets_folder)

    nltk_model_file = open('algorithms/data/NLTK_model_data/model.txt', 'rb')
    trained = pickle.load(nltk_model_file)

    sentence_tokenizer = PunktSentenceTokenizer(trained.get_params())

    text = sentence_tokenizer.tokenize(text)

    print("Sentence tokenizer:")
    print(text)

    text = run_name_entity_recognizer(text)

    print("Name Entity Recognizer:")
    print(text)

    text = word_tokenizer(text)

    print("Word tokenizer:")
    print(text)

    text = words_clasifier(text, word_sets)

    print("Word clasifier")
    print(text)

    return text
Example #29
def semafor_local(text):
    semafor = join(
        dirname(__file__),
        '../{0}/bin/runSemafor.sh'.format(config.get('semafor', 'base_dir')))
    input_file = join(
        dirname(__file__),
        '../{0}/bin/in.txt'.format(config.get('semafor', 'base_dir')))
    with open(input_file, 'w') as f:
        tokenizer = PunktSentenceTokenizer()
        sentences = tokenizer.tokenize(text)
        f.write('\n'.join(sentences))
    output_file = join(
        dirname(__file__),
        '../{0}/bin/out.txt'.format(config.get('semafor', 'base_dir')))
    if isfile(output_file):
        remove(output_file)
    process = subprocess.Popen([semafor, input_file, output_file, '1'],
                               shell=False)
    out, err = process.communicate(text)
    if err:
        log.debug(err)

    sentences_semantics = []
    with open(output_file) as f:
        # semafor outputs an invalid JSON, with one dictionary per line
        for line in f:
            sentence_dict = json.loads(line.rstrip())
            sentences_semantics.append(sentence_dict)

    return sentences, sentences_semantics
Example #30
def preprocess(phys):
    '''
    :param fname: a text file
    :return: a json of sentences, processed for searchability
    '''

    phys = phys.decode('utf-8')
    phys = re.sub('(\n)+', '. ', phys)

    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(phys)

    for i in xrange(len(sentences)):
        sentence = unicode(sentences[i])
        sentence = sentence.replace('\n', ' ')
        sentence = re.sub(' +',' ',sentence)
        sentence = re.sub(r'\d+', '', sentence)
        sentence = sentence.replace("-"," ")
        exclude = string.punctuation
        sentence = ''.join(ch for ch in sentence if ch not in exclude)
        sentence = re.sub(' +',' ',sentence)
        sentences[i] = sentence
        # sentences[i] = sentence.encode('utf-8')
    # drop empty or whitespace-only entries (popping while iterating would skip items)
    sentences = [s for s in sentences if s not in (' ', '')]

    # with open(fname.rstrip('txt')+'json', 'w') as outfile:
    #     json.dump(sentences, outfile)

    return sentences
Example #31
File: testrank.py, Project: HsiaoCong/eg
def TextRank(document):

    # Split the document into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)

    # Count term frequencies
    c = CountVectorizer()

    # Compute tf-idf
    bow_matrix = c.fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # All words in the bag-of-words vocabulary
    all_words = c.get_feature_names()

    # index -> word mapping
    index2words = {v: k for k, v in c.vocabulary_.items()}

    # Indices of the top-3 keywords per sentence, by tf-idf
    top_n_idx = np.argsort(normalized.todense())[:, -3:]
    #print(top_n_idx)

    # Map those indices back to the keywords themselves
    top_n_words = np.vectorize(index2words.get)(top_n_idx)
    #print(top_n_words)

    # Sentence-to-sentence similarity matrix
    similarity_graph = normalized * normalized.T

    # Build a graph over sentences and run PageRank (TextRank)
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)),
                  reverse=True)
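A short usage sketch: TextRank returns (score, sentence) pairs sorted best-first, so a three-sentence summary can be taken like this (the input document is illustrative):

document = open('article.txt', encoding='utf-8').read()
ranked = TextRank(document)
summary = ' '.join(sentence for score, sentence in ranked[:3])
print(summary)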
Example #32
File: word.py, Project: diyclassics/cltk
class PunktWordTokenizer(WordTokenizer):
    """Class for punkt word tokenization"""
    def __init__(self, sent_tokenizer: object = None):
        """
        :param language : language for sentences tokenization
        :type language: str
        """
        if sent_tokenizer:
            self.sent_tokenizer = sent_tokenizer()
        else:
            punkt_param = PunktParameters()
            self.sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    def tokenize(self, text: str):
        """
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        """
        sents = self.sent_tokenizer.tokenize(text)
        tokenizer = TreebankWordTokenizer()
        return [
            item for sublist in tokenizer.tokenize_sents(sents)
            for item in sublist
        ]
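A minimal usage sketch of the class above (the sample text is illustrative; it assumes the class and its nltk imports are available). The result is a flat list of word tokens taken across all detected sentences:

tokenizer = PunktWordTokenizer()
print(tokenizer.tokenize('Mr. Smith went home. He was tired.'))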
Example #33
def myNLTKParser(document, tagger):
    lexical_diversity = len(document) / len(set(document)) * 1.0

    punkt_param = PunktParameters()
    # if any customized abbrev
    #punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])

    # tokenize to sentence
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(document.replace('\'s', '_s'))

    # tokenize sentence to words
    word_tokens = [[
        w.strip() for w in nltk.word_tokenize(s)
        if not w.strip().lower() in stopwords
    ] for s in sentences]

    # extend token to bigram and trigram
    extended_tokens = []
    for token_list in word_tokens:
        extended_tokens.append(token_list + list(nltk.bigrams(token_list)) +
                               list(nltk.trigrams(token_list)))

    # word stemmer to normalize
    p_stemmer = PorterStemmer()
    stem_tokens = []
    for token_list in word_tokens:
        stem_tokens.append([p_stemmer.stem(w) for w in token_list])

    # POS tags
    tags = [tagger.tag(a) for a in extended_tokens]

    tags_of_verbs = ['NN', 'VB', 'VBP', 'VBG']
    tags_of_interest = [
        'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNPS', 'NNS', 'RB', 'RBR', 'RBS'
    ]
    tags_of_noun = ['NN']
    merged_tags_uni = [
        word for sublist in tags for (word, tag) in sublist
        if tag in tags_of_verbs and isinstance(word, tuple) == False
    ]
    merged_tags_bi = [
        word for sublist in tags for (word, tag) in sublist if
        tag in tags_of_interest and isinstance(word, tuple) and len(word) == 2
    ]
    merged_tags_tri = [
        word for sublist in tags for (word, tag) in sublist if
        tag in tags_of_interest and isinstance(word, tuple) and len(word) == 3
    ]

    uni_tags_fd = nltk.FreqDist(merged_tags_uni)
    bi_tags_fd = nltk.FreqDist(merged_tags_bi)
    tri_tags_fd = nltk.FreqDist(merged_tags_tri)

    return {
        'uni_fd': uni_tags_fd.max(),
        'bi_fd': bi_tags_fd.max(),
        'tri_fd': tri_tags_fd.max(),
    }
Example #34
    def sentences(self):
        try:
            return self.sentences_list

        except (AttributeError):
            sentence_tokenizer = SentenceTokenizer()
            self.sentences_list = sentence_tokenizer.tokenize(self.corpus)
            return self.sentences_list
Example #35
    def sentences(self):
        try:
            return self.sentences_list

        except(AttributeError):
            sentence_tokenizer = SentenceTokenizer()
            self.sentences_list = sentence_tokenizer.tokenize(self.corpus)
            return self.sentences_list
Example #36
 def tokenize(self, text, **kwargs):
     """
     Only behavior I want to modify is this method
     :param text:
     :param kwargs:
     """
     for x in PunktSentenceTokenizer.tokenize(self, text):
         yield x
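For context, this method only makes sense inside a subclass of PunktSentenceTokenizer; a minimal sketch of the assumed wrapper (the class name is hypothetical):

from nltk.tokenize.punkt import PunktSentenceTokenizer

class LazyPunktTokenizer(PunktSentenceTokenizer):
    def tokenize(self, text, **kwargs):
        # expose the parent's sentence list as a generator
        for x in PunktSentenceTokenizer.tokenize(self, text):
            yield x

for sent in LazyPunktTokenizer().tokenize('One sentence. Another sentence.'):
    print(sent)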
Example #37
File: stringutil.py, Project: khasathan/nlp
def _punkt_sent_tokenize(text):
    '''
     Sentence segmentation using nltk PunktSentenceTokenizer.
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(config.tokenize_abbrev)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
Example #38
def split_sentence(caption):

    # Initialize the sentence tokenizer
    tokenizer = PunktSentenceTokenizer()
    # Tokenize the caption
    caption_tokens = tokenizer.tokenize(caption)
    # Return a list of tokens (sentences)
    return caption_tokens
Example #39
 def _split_text_to_sentences(self, text):
     # splits text to sentences (uses some utilities from nltk)
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(
         ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(text)
     return sentences
Example #40
class TagExtractor:
  """Extracts tags from a body of text using the NLTK toolkit."""
  
  def __init__(self):
    """Creates a default Topia tagger and extractor."""
    self.sentence_tokenizer = PunktSentenceTokenizer()
    self.parser = nltk.RegexpParser(GRAMMAR)
    self.productions = ['NP', 'VB', 'ADV']

  def __is_just_stop_words(self, words):
    return not any([word not in STOP_WORDS for word in words])

  def extract_tags(self, text):
    """Extract tags from the text."""
    tags = {} 
    for sentence in self.sentence_tokenizer.tokenize(text):
      chunks = self.__chunk_sentence(sentence)
      for production in chunks.productions():
        tag_tokens = []
        pos = production.lhs().symbol()
        if pos in self.productions:
          for (word, x) in production.rhs():
            # Preprocess, and potentially, filter out the word.
            trimmed = filter_word(trim_word(word))
            if trimmed:
              tag_tokens.append(trimmed.lower())
          if len(tag_tokens) > 0:
            tag_text = string.join(tag_tokens, ' ')
            if self.__is_just_stop_words(tag_tokens):
              continue
            tag = self.__lookup_tag(tags, tag_text, pos) 
            tag.increment_occurs()
            tag.set_pos(pos)
    results = tags.values()
    results.sort(key = tag_compare_key)
    return results

  def __lookup_tag(self, tags, text, pos):
    tag = tags.get(self.__get_tag_key(text, pos))
    if not tag:
      tag = Tag(text, 0, pos)
      tags[self.__get_tag_key(text, pos)] = tag
    return tag

  def __get_tag_key(self, text, pos):
    """I want to keep the way we look up tags flexible so that I can easily change my mind
       on what uniquely identifies a tag (e.g. just the text?  the text and the part of speech?).
       That is why all the logic for looking up tags is in this one method."""
    return text
    
  def __chunk_sentence(self, sentence):
    """Tokenize the sentence into words using a whitespace parser to avoid parsing couldn't into two tokens (could and n't).
       Then chunk the tokens according to GRAMMAR.
    """
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(sentence)
    pos_tagged = nltk.pos_tag(tokens)
    return self.parser.parse(pos_tagged)
Example #41
 def transform(self,documents):
     sentence_splitter = PunktSentenceTokenizer()
     for doc in documents:
         if not 'sentences' in doc.ext:
             doc.ext['sentences'] = [s.strip() for s in sentence_splitter.tokenize(doc.text)]
     # for doc in documents:
     #     if not 'sentences' in doc.ext:
     #         doc.ext['sentences'] = [s.strip() for s in doc.text.split('.') if s]
     return documents
Example #42
def parse (text):
    """Use nltk's PunktSentenceTokenizer to convert the text string into
    a list of English-language sentences."""

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(ABBREVIATIONS)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)

    return sentence_splitter.tokenize(preprocess(text))
Example #43
def split_into_sentences(input_file_name, output_file_name):
    tokenizer = PunktSentenceTokenizer()

    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as sentence_file:
            for line in input_file:
                labelled_review = json.loads(line)
                tokenized_text = tokenizer.tokenize(labelled_review['text'])
                json.dump([tokenized_text, labelled_review['score']], sentence_file)
                sentence_file.write("\n")
Example #44
	def bayesSentiment(self, text):
		from nltk.tokenize.punkt import PunktSentenceTokenizer
		from senti_classifier import senti_classifier

		# break up text into sentences
		stzr = PunktSentenceTokenizer()
		sents = stzr.tokenize(text)
		pos_score, neg_score = senti_classifier.polarity_scores(sents)
		#print pos_score, neg_score
		return [pos_score, neg_score]
Example #45
def analyse_hansard_file(filename='House of Representatives_2018_05_10_6091.xml'):
    # Word frequency analysis
    my_abbrev = ['\'m', '.', ',', '\'s', '(', ')', 'n\'t', '\'ve', ';', '$', ':', '\'', '?', '\'ll', '\'re']
    stoplist = set(stopwords.words('english') + my_abbrev)
    soup, sample = parse_hansard(filename)

    # Tokenisation, tagging, chunking
    sent_tokenizer = PunktSentenceTokenizer()
    # Stop breaking sentence at "No."
    sent_tokenizer._params.abbrev_types.add('no')
    #sentences = nltk.sent_tokenize(sample)
    # TODO: improve sentence tokenizer - still far from good
    sentences = sent_tokenizer.tokenize(sample)

    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    # Word frequency over all sentences
    tokens = []
    for sentence in tokenized_sentences:
        tokens += [word for word in sentence if word.lower() not in stoplist]
    display_freq(tokens)

    # Part-of-speech analysis
    tags = []
    for sentence in tagged_sentences:
        tags += sentence
    pos_analysis(tags, my_abbrev)

    # spaCy NER
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sample)
    # Find named entities, phrases and concepts
    ne_spacy = {}
    for entity in doc.ents:
        if entity.label_ in ne_spacy:
            ne_spacy[entity.label_] += [entity.text]
        else:
            ne_spacy[entity.label_] = [entity.text]
    logger.debug("Entity number per type: %s" % {k:len(v) for k,v in ne_spacy.items()})
    for k in ne_spacy.keys():
        display_freq(ne_spacy[k], 'Named entities (%s)' % (k,), top=20)

    # Interjection analysis
    parties = {}
    all_interjections = soup.find_all('interjection')
    for interjection in all_interjections:
        # Can be either a party or a role (Speaker, President, etc, ...)
        party = interjection.party.text or interjection.find('name', role='metadata').text
        if party in parties:
            parties[party] = parties[party] + 1
        else:
            parties[party] = 1
    logger.debug("%s interjections: %s" % (len(all_interjections), parties))
Example #46
class Sent_Tokenizer():
    def __init__(self):
        with open(TREETAGGER_ABBREVIATIONLIST, mode='r', encoding='utf-8') as f:
            abbr = set([l.strip('.\n') for l in f.readlines()])
        
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = abbr 
        self.tokenizer = PunktSentenceTokenizer(punkt_param)
    
    def tokenize(self, text):
        return self.tokenizer.tokenize(text)
Example #47
class ReviewItem:
   def __init__(self, review, rating):
      self.tok = PunktSentenceTokenizer()
      self.rating = rating
      self.review = review

   def words(self):
      return word_tokenize(self.review)

   def sents(self):
      return [word_tokenize(sent) for sent in self.tok.tokenize(self.review)]
Example #48
File: sums.py, Project: codebhendi/opinator
	def textrank(self, document):
	    sentence_tokenizer = PunktSentenceTokenizer()
	    sentences = sentence_tokenizer.tokenize(document)

	    bow_matrix = CountVectorizer().fit_transform(sentences)
	    normalized = TfidfTransformer().fit_transform(bow_matrix)

	    similarity_graph = normalized * normalized.T

	    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
	    scores = nx.pagerank(nx_graph)
	    return sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
Example #49
    def __init__(self, document):
        self.document = document

        self.sumLength = 10

        self.weights = {}
        self.invWeights = {}
        self.sumIndex = {}
        self.summary = {}

        tokenizer = PunktSentenceTokenizer()
        self.sentences = [sentence.lower() for sentence in tokenizer.tokenize(document)]
Example #50
def myNLTKParser(document, tagger):
    lexical_diversity = len(document) / len(set(document)) * 1.0

    punkt_param = PunktParameters()
    # if any customized abbrev
    # punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])

    # tokenize to sentence
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(document.replace("'s", "_s"))

    # tokenize sentence to words
    word_tokens = [[w.strip() for w in nltk.word_tokenize(s) if not w.strip().lower() in stopwords] for s in sentences]

    # extend token to bigram and trigram
    extended_tokens = []
    for token_list in word_tokens:
        extended_tokens.append(token_list + list(nltk.bigrams(token_list)) + list(nltk.trigrams(token_list)))

        # word stemmer to normalize
    p_stemmer = PorterStemmer()
    stem_tokens = []
    for token_list in word_tokens:
        stem_tokens.append([p_stemmer.stem(w) for w in token_list])

        # POS tags
    tags = [tagger.tag(a) for a in extended_tokens]

    tags_of_verbs = ["NN", "VB", "VBP", "VBG"]
    tags_of_interest = ["JJ", "JJR", "JJS", "NN", "NNP", "NNPS", "NNS", "RB", "RBR", "RBS"]
    tags_of_noun = ["NN"]
    merged_tags_uni = [
        word for sublist in tags for (word, tag) in sublist if tag in tags_of_verbs and isinstance(word, tuple) == False
    ]
    merged_tags_bi = [
        word
        for sublist in tags
        for (word, tag) in sublist
        if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 2
    ]
    merged_tags_tri = [
        word
        for sublist in tags
        for (word, tag) in sublist
        if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 3
    ]

    uni_tags_fd = nltk.FreqDist(merged_tags_uni)
    bi_tags_fd = nltk.FreqDist(merged_tags_bi)
    tri_tags_fd = nltk.FreqDist(merged_tags_tri)

    return {"uni_fd": uni_tags_fd.max(), "bi_fd": bi_tags_fd.max(), "tri_fd": tri_tags_fd.max()}
Example #51
def keyword_sentiment():

    ## take in the input
    word = sys.argv[1]
    date_diff = int(sys.argv[2])
    
    ## create a sentence_tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    
    ## calculate the barrier date
    DD = datetime.timedelta(days=date_diff)
    barrier_date = datetime.datetime.now()- DD

    ## make connection to db and fetch tweets (and respective sentiment) above the barrier_date
    db = MySQLdb.connect(host="localhost",user="******",passwd="{2qGq(22+5iU",db="Insights")
    cur = db.cursor()
    sql = "SELECT Phrase,Sentiment FROM Phrases WHERE `Date`>'"+str(barrier_date)+"';"
    cur.execute(sql)

    total_sentiment = 0
    total_count = 0
    ## locate tweets which contain keyword, tokenize them into sentences
    for row in cur.fetchall():
        if(row[0].lower().find(word.lower())!=-1):
            sentences = sent_tokenizer.tokenize(row[0])
            
    ## if a single sentence then just take the sentiment from db
            if len(sentences) == 1:
                total_sentiment = total_sentiment + float(row[1])
                total_count = total_count+1
                
    ## else add together sentiment of sentence and keep the count
            else:
                for sentence in sentences:
                        blob = TextBlob(sentence)
                        total_sentiment= total_sentiment + int(blob.sentiment.polarity*1000)/1000.0
                        if(sentence.lower().find(word.lower())!=-1):
                            total_count = total_count+1
                            
    ## json the total_sentiment/count and count
    if(total_count!=0):
        json_array = json_array = [{"sentiment": int(total_sentiment/total_count*1000)/1000.0, "count": total_count}]
    else:
        json_array = json_array = [{"sentiment": 0, "count": 0}]
    ## close the connection to the db
    db.close()
    ## print the json
    print(json.dumps(json_array))
Example #52
def build_doc2vec_model(save_file=False):
    client = MongoClient()
    db = client['metacritic']
    coll = db['steam_games']

    all_games = list(coll.find({'user_review': {"$exists": "true"},
                            'total_user_reviews': {'$ne': 0},
                            'game_name': {'$not': re.compile("Demo")} }))

    plv = PunktSentenceTokenizer()
    # stemmer = PorterStemmer()

    labeled_sentences = []
    for game in all_games:
        game_name = game['game_name']
        user_data = game['user_review']
        # critic_data = game['critic_review']

        user_reviews = user_data['reviews']

        for user_review in user_reviews:
            review = user_review['review']
            review = review.encode('ascii', 'replace')
            review = str(review).translate(string.maketrans("",""), string.punctuation)
            review_sentence = [sentence.split() for sentence in plv.tokenize(review.lower())]

            if len(review_sentence) == 0: 
                continue
            else:
                review_sentence = review_sentence[0]
                # stemmed_sentence = []
                # for word in review_sentence[0]:
                #     stemmed_sentence.append(stemmer.stem(word))

            sentence = doc2vec.LabeledSentence(words=review_sentence, labels=[game_name])
            labeled_sentences.append(sentence)

    model = Doc2Vec(alpha=0.025, min_alpha=0.025, workers=4)#, train_words=False, train_lbls=True)
    model.build_vocab(labeled_sentences)

    for epoch in range(10):
        model.train(labeled_sentences)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay

    if save_file:
        with open('data/model.pkl', 'wb') as f_model:
            pickle.dump(model, f_model)
    else:
        return model
Example #53
def splitIntoSentences2(file_name):
  punkt_param = PunktParameters()
  punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
  sentence_splitter = PunktSentenceTokenizer(punkt_param)
  fp = open(file_name)
  data = fp.read()
  data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

  sentences = []
  for para in data.split('\n'):
    if para:
      sentences.extend(sentence_splitter.tokenize(para))
  # print '\n-----\n'.join(sentences)
  return sentences
Example #54
    def test_tokenize(self):

        train = "\n".join(itertools.imap(strip_tags, itertools.chain(*(speech['text'] for speech in self.speeches[0:10]))))

        print train
        tokenizer = PunktSentenceTokenizer(train)



        sents = tokenizer.tokenize(strip_tags(self.speeches[0]['text'][0]))

        sents = tokenize_sents(strip_tags(self.speeches[0]['text'][0]))

        self.assertEqual(len(sents), 3)
Example #55
File: Model.py, Project: gabruszka/SAIL
    def loadCorpus(self, path):
        
        for encoding in self.__encodings:

            try:
                self.__path = path
                fileName = codecs.open( self.__path,'r', encoding=encoding )
                self.__rawText = fileName.read()
                break
            
            except UnicodeDecodeError:
                encoding = ''
                continue
                 
        if encoding!='':
            self.initFields()
            
            #SENTENCES
            # more abbreviations with dots
            punkt_param = PunktParameters()
            punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag'])

            sentence_splitter = PunktSentenceTokenizer(punkt_param)
            text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText)
            #text = re.sub('(\d+)', r' \1 ', text)
            sentences = sentence_splitter.tokenize(text)
            
            #TOKENS
            self.__tokens = [[token, ''] for token in list(itertools.chain(*[ customWordtokenize(sent) for sent in sentences]))]
            wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+')
            #wordTokenizer = RegexpTokenizer('[\w]+')
            
            sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0]
            words =  list(itertools.chain(*sentences))
            self.__words = words
            self.__sentences = sentences
            
            self.__avgSentLength = round(np.mean( [len(sent) for sent in sentences]), 3)
            self.__avgWordLength = round(np.mean( [len(word) for word in words]), 3)
            self.__freqDist = FreqDist(words)
            self.__wordCount = len(words)
            self.__lexicalDiversity = round(len(self.__freqDist.items())/float(len(words)), 5)
            
            ### resetting members
            self.__concordanceIndex = None
            self.__bigrams = None
                 
        return encoding
Example #56
def get_reviews(games_df):
    plv = PunktSentenceTokenizer()
    reviews = games_df.excerpt.tolist()

    sentences = []
    for review in reviews:
        review = review.encode('ascii', 'replace')
        review = str(review).translate(string.maketrans("",""), string.punctuation)
        review_sentence = [sentence.split() for sentence in plv.tokenize(review.lower())]
        if len(review_sentence) == 0: 
            sentences.append([])
        else:
            sentences.extend(review_sentence)

    return sentences
Example #57
def getSentences(paragraph):

	unicode_data= paragraph.decode("utf-8")
	data= "".join([i if ord(i) < 128 else "" for i in unicode_data])

	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	punkt_params = PunktParameters()
	punkt_params.abbrev_types = set(['al',"inc","mr","dr","mrs","prof"])
	splitter = PunktSentenceTokenizer(punkt_params)

	sentences=splitter.tokenize(data)
	
	sentences1=filter_list(sentences)
	##print sentences1,"\n----------------------------------------------------------------------------"
	return sentences1
Example #58
File: word.py, Project: cltk/cltk
class BasePunktWordTokenizer(BaseWordTokenizer):
    """Base class for punkt word tokenization"""

    def __init__(self, language: str = None, sent_tokenizer:object = None):
        """
        :param language : language for sentence tokenization
        :type language: str
        """
        self.language = language
        super().__init__(language=self.language)
        if sent_tokenizer:
            self.sent_tokenizer = sent_tokenizer()
        else:
            punkt_param = PunktParameters()
            self.sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    def tokenize(self, text: str):
        """
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        """
        sents = self.sent_tokenizer.tokenize(text)
        tokenizer = TreebankWordTokenizer()
        return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]