Python tokenize 예제들, pattern.en.tokenize Python 예제들

예제 #1

1

파일 보기

파일: analysis.py 프로젝트: markos-aivazoglou/Reveal

def sentiment_analysis(message):
    """Return the summed sentiment score of the scorable sentences in a message.

    The message is stripped of @usernames, " RT"/" rt" markers, URLs, '#'
    characters and non-printable characters, HTML entities are unescaped, and
    the result is split into sentences with ``tokenize``.  Sentences that
    contain '?' or whose ``mood`` is 'conditional' are skipped.  The remaining
    sentences are split on single spaces, POS-tagged with ``nltk.pos_tag``,
    passed through ``dictionary_tag`` and scored with ``sentiment_score``.

    :param message: raw message text
    :return: float sum of the per-sentence scores (0.0 when nothing is scored)
    """
    # Filter usernames, retweet markers and URLs.
    message = re.sub(r"(@[A-Za-z0-9]+)|( RT)|( rt)|(\w+:\/\/\S+)", " ",
                     message).strip()
    message = message.replace('#', "")
    # Filter non-printable characters.
    message = ''.join(ch for ch in message if ch in string.printable)
    message = HTMLParser.HTMLParser().unescape(message)  # unescape html

    # The keyword used to be misspelled "puctuation", so the custom
    # punctuation set never reached the tokenizer under its real name.
    sentences = [s for s in tokenize(message, punctuation='.!?:') if s]

    kept = []
    for sentence in sentences:
        # Questions and conditional statements are not scored.
        if '?' in sentence or mood(sentence) == 'conditional':
            continue
        kept.append(sentence.strip())

    score = 0.0
    for sentence in kept:
        # pos_tag yields (word, tag) pairs; dictionary_tag receives lists.
        tagged = [list(pair) for pair in nltk.pos_tag(sentence.split(' '))]
        score += sentiment_score(dictionary_tag(tagged))
    return score

예제 #2

0

파일 보기

파일: util.py 프로젝트: katadh/poetrymodeling

 def __iter__(self):
     """Yield (input_tokens, target_tokens) pairs from SQuAD-style JSON files.

     ``self.fname`` is either one JSON file or a directory whose entries are
     all read.  Each file must hold a ``"data"`` list of records with
     ``"sentence"``, ``"question"`` and ``"answer"`` entries.

     Depending on ``self.mode``:
     * ``"squad"`` / ``"squad_ptr"``: the three fields are split into
       characters with ``list()``.
     * ``"squad_word"``: the first sentence returned by ``tokenize`` for
       each field is split on single spaces.

     The input is ``[begin] + sentence + [middle] + question + [end]`` and
     the target is ``answer + [end]``.  Other modes yield nothing.
     """
     if os.path.isdir(self.fname):
         filenames = [
             os.path.join(self.fname, f) for f in os.listdir(self.fname)
         ]
     else:
         filenames = [self.fname]

     def as_chars(value):
         return list(value)

     def as_words(value):
         return tokenize(value)[0].split(" ")

     for filename in filenames:
         # Load eagerly so the file is not held open across yields.
         with io.open(filename, encoding='utf-8') as f:
             squad = json.load(f)
         # Single-argument form prints the same text on Python 2 and 3.
         print("Loaded data of len %d" % len(squad['data']))
         for d in squad['data']:
             if self.mode in ("squad", "squad_ptr"):
                 split = as_chars
             elif self.mode == "squad_word":
                 split = as_words
             else:
                 continue
             source = ([self.begin] + split(d["sentence"]) + [self.middle]
                       + split(d["question"]) + [self.end])
             target = split(d["answer"]) + [self.end]
             yield source, target

예제 #3

0

파일 보기

파일: news.py 프로젝트: evijit/Tweets_NLP

def opinioncheck(line):
    """Append `line` to posop / negop once per opinion word it contains.

    The line is tokenized into sentences, each sentence is tokenized again,
    and every whitespace-separated word is looked up in ``poslist`` and
    ``neglist``.  Each hit appends the whole original line to ``posop`` or
    ``negop`` respectively, so a line with several hits is appended several
    times.  All four names are defined outside this function.
    """
    for sentence in tokenize(line):
        for chunk in tokenize(sentence):
            for word in chunk.split():
                if word in poslist:
                    posop.append(line)
                if word in neglist:
                    negop.append(line)

예제 #4

0

파일 보기

파일: util.py 프로젝트: katadh/poetrymodeling

    def __iter__(self):
        """Yield token sequences built from the lines of the input file(s).

        ``self.fname`` is a single file or a directory whose entries are all
        read.  Every non-empty line is lowercased, reduced to the letters
        a-z and spaces, tokenized, and terminated with a ``<brN>`` marker
        where N is the line's index in the file.

        * ``"oedilf"``: one sequence per file, ``[begin] + tokens + [end]``.
        * ``"oedilf_rhymes"``: as above, but only the last token of each
          line is kept before its marker.
        * ``"oedilf_s2s"``: one ``(history, line)`` pair per line, where the
          history is ``[begin] + all previous line tokens + [end]``.
        """
        allowed = "qwertyuioplkjhgfdsazxcvbnm "

        def line_tokens(index, raw_line, last_only=False):
            # Keep only lowercase letters and spaces before tokenizing.
            kept = ''.join([c for c in raw_line.lower() if c in allowed])
            pieces = ' '.join(tokenize(kept)).split(" ")
            if last_only:
                pieces = pieces[-1:]
            pieces = pieces + ['<br' + str(index) + '>']
            return [piece for piece in pieces if piece != '']

        if os.path.isdir(self.fname):
            filenames = [
                os.path.join(self.fname, entry)
                for entry in os.listdir(self.fname)
            ]
        else:
            filenames = [self.fname]

        for filename in filenames:
            with open(filename) as f:
                doc = f.read()
                if self.mode == "oedilf":
                    toks = [self.begin]
                    for i, line in enumerate(doc.split("\n")):
                        if line:
                            toks += line_tokens(i, line)
                    yield toks + [self.end]
                if self.mode == "oedilf_rhymes":
                    toks = [self.begin]
                    for i, line in enumerate(doc.split("\n")):
                        if line:
                            toks += line_tokens(i, line, last_only=True)
                    yield toks + [self.end]
                if self.mode == "oedilf_s2s":
                    history = []
                    for i, line in enumerate(doc.split("\n")):
                        if not line:
                            continue
                        current = line_tokens(i, line)
                        yield ([self.begin] + history + [self.end],
                               current + [self.end])
                        history += current

예제 #5

0

파일 보기

파일: test_en.py 프로젝트: DataBranner/pattern

 def test_tokenize(self):
     # tokenize() should return one string per sentence, with punctuation
     # split off and a common abbreviation ("e.g.") kept in one piece.
     expected = ["The cat is eating ( e.g. , a fish ) .", "Yum !"]
     actual = en.tokenize("The cat is eating (e.g., a fish). Yum!")
     self.assertEqual(actual, expected)
     print("pattern.en.tokenize()")

예제 #6

0

파일 보기

파일: DocumentEmbedding.py 프로젝트: subhadeepmaji/ml_algorithms

    def form_sentences(self, text_block, block_id, remove_stopwords=False,
                       stem=True, form_tagged_doc=True):
        """
        Split a block of text into sentences of filtered, optionally stemmed words.

        :param text_block: single block of text as a string
        :param block_id: id of the text block; used as the tag prefix
        :param remove_stopwords: forwarded to the private word filter
        :param stem: when true, every kept word goes through self.stemmer
        :param form_tagged_doc: when true, each sentence is wrapped in a
            TaggedDocument tagged "<block_id> <sentence index>" and stored in
            self.doc_tags under that tag
        :return: list of word lists, or list of TaggedDocument objects
        """
        swaps = (('\'', ''), ('(', ' '), (')', ' '), ("/", " or "), ("-", ""))

        def scrub(sentence):
            # Apply the character replacements in their original order.
            for old, new in swaps:
                sentence = sentence.replace(old, new)
            return sentence

        def maybe_stem(word):
            return self.stemmer(word) if stem else word

        sentences = [scrub(s) for s in pattern.tokenize(text_block.lower())]
        sentences = [self.sentence_func(TAG_RE.sub('', s)) for s in sentences]
        sentences = [[maybe_stem(w) for w in word_tokenize(s)
                      if self.__word_filter(w, remove_stopwords)]
                     for s in sentences]

        if not form_tagged_doc:
            return sentences

        sentences = [TaggedDocument(words=words,
                                    tags=[str(block_id) + ' ' + str(index)])
                     for index, words in enumerate(sentences)]
        for tagged in sentences:
            self.doc_tags[tagged.tags[0]] = tagged
        return sentences

예제 #7

0

파일 보기

def word_ranking(text, n='L2'):
    """
    Score the terms of a text with an LSA-based "cross method".

    steps:
    1. tokenize the text into sentences, one Document per sentence
    2. build a TFIDF-weighted Model
    3. reduce the model with LSA (``m.reduce(dimensions=n)``)
    4. zero every cell of the Vt matrix that does not exceed its row mean,
       weight the rows by sigma and sum each column
       (cross-method: http://www.aclweb.org/anthology/C10-1098.pdf,
       http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf)

    - text: string consisting of a few sentences
    - n: value passed as ``dimensions`` to ``Model.reduce``

    Returns a Counter mapping each entry of ``m.lsa.terms`` to its score.
    """
    sentences = tokenize(text)

    # One document per sentence, named by its position in the text.
    docs = [Document(sentence, name=position)
            for position, sentence in enumerate(sentences)]

    model = Model(docs, weight=TFIDF)
    model.reduce(dimensions=n)

    # Copy of Vt; presumably concepts (rows) x terms (columns) -- TODO confirm.
    vt = np.array(model.lsa.vt)

    # Cells at or below their row's mean are zeroed out.
    row_means = np.mean(vt, axis=1).reshape((-1, 1))
    vt[vt <= row_means] = 0.0

    # Singular values as a column, so they scale the rows of Vt.
    sigma = np.array(model.lsa.sigma).reshape((-1, 1))

    # Per-column totals of the sigma-weighted matrix.
    scores = np.sum(vt * sigma, axis=0)

    return Counter(dict(zip(model.lsa.terms, scores)))

예제 #8

0

파일 보기

파일: machine.py 프로젝트: johan--/patent-generator

 def sentence_walk(self):
     """Pretty-print the sentences of self.source_text that contain an "artifact" word.

     The words come from ``search.hypernym_search(self.source_text,
     "artifact")``.  A sentence is kept when any of those words occurs in
     it with a space on both sides.  Nothing is returned.
     """
     sents = tokenize(self.source_text)
     words = set(search.hypernym_search(self.source_text, "artifact"))
     # Group the alternation: without (?:...) the pattern " a|b|c " means
     # " a" or "b" or "c ", so the spaces only guard the first and last
     # words.  Escape the words so regex metacharacters match literally.
     pat = re.compile(" (?:" + "|".join(re.escape(w) for w in words) + ") ")
     sents = [s for s in sents if pat.search(s) is not None]
     pprint(sents)

예제 #9

0

파일 보기

def dispersion(text, keywords):
    """
    Locate every occurrence of the given keywords in the text.

    The text is tokenized into sentences, lowercased and split on
    whitespace.  Each token equal to a keyword produces one (token index,
    keyword index) pair; pairs are ordered by token index, then keyword index.

    - text: string
    - keywords: list of keywords (compared against lowercased tokens)

    Returns two equal-length tuples ``(x, y)``: token positions and the
    matching keyword indexes.  Both are empty when no keyword occurs
    (the original raised ValueError while unpacking in that case).
    """
    tokens = [w for sent in tokenize(text) for w in sent.lower().split()]

    disp = [(x, y)
            for x, token in enumerate(tokens)
            for y, keyword in enumerate(keywords)
            if token == keyword]

    if not disp:
        return (), ()

    x, y = zip(*disp)
    return x, y

예제 #10

0

파일 보기

 def test_tokenize(self):
     # Two sentences in, two strings out: punctuation is split from the
     # words and the abbreviation "e.g." must stay in one piece.
     source = "The cat is eating (e.g., a fish). Yum!"
     wanted = ["The cat is eating ( e.g. , a fish ) .", "Yum !"]
     self.assertEqual(en.tokenize(source), wanted)
     print("pattern.en.tokenize()")

예제 #11

0

파일 보기

파일: summarize.py 프로젝트: xmonkee/Shards

def summarize(text_to_summarize):
    """Return the two highest-PageRank sentences of the text, in text order.

    STEP 1: tokenize into sentences; every sentence with more than 7
            space-separated tokens becomes a Document named by its index.
    STEP 2: for each ordered pair of distinct documents, compare the second
            element of their keywords() entries with distance.jaccard.
    STEP 3: pairs with a distance below 1 become weighted graph edges
            (weight = 1 - distance).
    STEP 4: run PageRank on the directed graph and join the two best
            sentences with a space.
    """
    stokens = tokenize(text_to_summarize)

    docs = []
    for position, sentence in enumerate(stokens):
        if len(sentence.split(" ")) > 7:
            docs.append(Document(string=sentence, name=position,
                                 stemmer=LEMMA))

    edges = []
    for source in docs:
        for target in docs:
            if source.name == target.name:
                continue
            source_words = [entry[1] for entry in source.keywords()]
            target_words = [entry[1] for entry in target.keywords()]
            gap = distance.jaccard(source_words, target_words)
            if gap < 1:
                # (sentence index, sentence index, similarity score)
                edges.append((str(source.name), str(target.name), 1 - gap))

    # Weighted PageRank, see
    # http://stackoverflow.com/questions/9136539/how-to-weighted-edges-affect-pagerank-in-networkx
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(edges)
    scores = nx.pagerank(graph)

    # Ascending sort followed by reversal, exactly as the original did, so
    # ties keep the same relative order.
    ranked = sorted(scores.items(), key=operator.itemgetter(1))
    best_two = ranked[::-1][:2]
    chosen = sorted([int(node) for node, _ in best_two])
    return " ".join([stokens[i] for i in chosen])

예제 #12

0

파일 보기

def summarize(text, n=2):
    """
    Pick the n most informative sentences by summing the ranks of their words.

    Each sentence's score is the sum of ``word_ranking`` scores of its
    lowercased, whitespace-separated words.  The n best sentences are
    returned in their original order, joined by single spaces.

    - text: string consisting of a few sentences
    - n: number of sentences to extract
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # NOTE(review): n is also forwarded to word_ranking, which uses it as
    # the LSA `dimensions` argument -- confirm this coupling is intended.
    w_ranking = word_ranking(text, n)

    # sentence score = sum of the scores of its ranked words
    s_ranking = Counter()
    for i, sent in enumerate(sentences):
        for word in sent.lower().split():
            if word in w_ranking:
                s_ranking[i] += w_ranking[word]

    # Restore document order by sentence index.  The previous code sorted
    # with a positional cmp function (Python 2 only) on text.find(sentence),
    # but tokenized sentences have punctuation split off by spaces and
    # often cannot be found in the raw text.
    top_indexes = sorted(index for index, _ in s_ranking.most_common(n))

    return ' '.join(sentences[index] for index in top_indexes)

예제 #13

0

파일 보기

파일: TestServer.py 프로젝트: mxalbert1996/review-classification

 def do_POST(self):
     """Handle ``POST /predict``: classify the form's 'text' field.

     Replies 404 (and returns 404) when the path is not '/predict' or the
     form has no 'text' field.  Otherwise replies 200 with a text/plain
     body containing ``LBL[prediction]`` and returns 200.
     """
     form = cgi.FieldStorage(fp=self.rfile,
                             headers=self.headers,
                             environ={
                                 'REQUEST_METHOD': 'POST',
                                 'CONTENT_TYPE':
                                 self.headers['Content-Type'],
                             })
     if self.path != '/predict' or 'text' not in form.keys():
         self.send_response(404)
         self.end_headers()
         return 404
     self.send_response(200)
     self.send_header("Content-type", 'text/plain')
     self.end_headers()
     # Insert a space where a sentence end is glued to the next capital
     # ("end.Next" -> "end. Next").  Raw strings avoid the invalid "\g"
     # escape; `count` is passed by keyword rather than positionally.
     spaced = re.sub(r'([a-z][.!?]+)([A-Z])', r'\g<1> \g<2>',
                     form['text'].value, count=0)
     text = ' '.join(tokenize(spaced)).lower().split()
     # Words missing from w2indx map to index 0.
     x = [[w2indx.get(word, 0) for word in text]]
     x = sequence.pad_sequences(x,
                                maxlen=200,
                                padding='post',
                                truncating='post')
     predict = model.predict_classes(x)[0][0]
     self.wfile.write(bytes(LBL[predict], encoding='utf8'))
     return 200

예제 #14

0

파일 보기

def sentiment_analysis(message):
    """Return the summed sentiment score of the scorable sentences in a message.

    The message is stripped of @usernames, " RT"/" rt" markers, URLs, '#'
    characters and non-printable characters, HTML entities are unescaped, and
    the result is split into sentences with ``tokenize``.  Sentences that
    contain '?' or whose ``mood`` is 'conditional' are skipped.  The remaining
    sentences are split on single spaces, POS-tagged with ``nltk.pos_tag``,
    passed through ``dictionary_tag`` and scored with ``sentiment_score``.

    :param message: raw message text
    :return: float sum of the per-sentence scores (0.0 when nothing is scored)
    """
    # Filter usernames, retweet markers and URLs.
    message = re.sub(r"(@[A-Za-z0-9]+)|( RT)|( rt)|(\w+:\/\/\S+)", " ",
                     message).strip()
    message = message.replace('#', "")
    # Filter non-printable characters.
    message = ''.join(ch for ch in message if ch in string.printable)
    message = HTMLParser.HTMLParser().unescape(message)  # unescape html

    # The keyword used to be misspelled "puctuation", so the custom
    # punctuation set never reached the tokenizer under its real name.
    sentences = [s for s in tokenize(message, punctuation='.!?:') if s]

    kept = []
    for sentence in sentences:
        # Questions and conditional statements are not scored.
        if '?' in sentence or mood(sentence) == 'conditional':
            continue
        kept.append(sentence.strip())

    score = 0.0
    for sentence in kept:
        # pos_tag yields (word, tag) pairs; dictionary_tag receives lists.
        tagged = [list(pair) for pair in nltk.pos_tag(sentence.split(' '))]
        score += sentiment_score(dictionary_tag(tagged))
    return score

예제 #15

0

파일 보기

파일: clean_text_processor.py 프로젝트: amitmohapatra/SentimentAnalysis

    def clean_text(text):
        """
        Split text into sentences of stemmed words with punctuation removed.

        Each sentence from ``tokenize`` is split on whitespace, words found
        in ``punctuation`` are dropped, the rest are stemmed with
        ``porter_stemmer`` and re-joined, the result is passed through
        ``CleanTextProcessor.clean_not_words`` and a "." is appended.

        :param text: text as str
        :return: list of sentences ([] when the stripped text is empty)
        :raises Exception: any failure is logged with its stack trace and
            re-raised as a plain Exception carrying that message
        """
        # `except Exception` instead of a bare `except`, so KeyboardInterrupt
        # and SystemExit are no longer converted into ordinary errors.
        try:
            text = text.strip()
            if not text:
                return []

            final_sentences = []
            for sentence in tokenize(text):
                stems = [
                    porter_stemmer.stem(word) for word in sentence.split()
                    if word not in punctuation
                ]
                cleaned_sent = CleanTextProcessor.clean_not_words(
                    " ".join(stems))
                final_sentences.append(cleaned_sent + ".")
            return final_sentences
        except Exception:
            trace_err = StackTrace.get_stack_trace()
            msg = "CleanTextProcessor (clean_text()) : %s%s" % ("\n",
                                                                trace_err)
            log.error(msg)
            raise Exception(msg)

예제 #16

0

파일 보기

def summarize(text, n=1):
    """
    extract most relevant sentences from text according to TextRank algorithm
    - text: string consisting of a few sentences
    - n: number of sentences to extract

    Returns the selected sentences in their original order, joined by single
    spaces, or an empty string when the ranking is empty.
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # one document per sentence, named by its index
    docs = [Document(sentence, name=index)
            for index, sentence in enumerate(sentences)]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # ranking returned by utils.textrank for the model's documents and
    # distance function; its keys are used below as sentence indexes
    ranking = utils.textrank(m.documents, m.distance)

    top = ranking.most_common(n)
    if not top:
        # Unpacking zip(*[]) raised ValueError on an empty ranking.
        return ''

    # reordering
    top_sents_idx = sorted(index for index, _ in top)

    # Join with a space: ''.join glued consecutive sentences together.
    return ' '.join(sentences[i] for i in top_sents_idx)

예제 #17

0

파일 보기

def keywords(text, n=15):
    """
    extract most relevant keywords from given text
    steps:
    1. tokenize text into sentences
    2. apply a syntactic filter: keep words tagged "JJ" or "NN", lowercased
    3. rank the kept words with utils.textrank using utils.levenshtein

    - text: string consisting of a few sentences
    - n: number of keywords to extract

    Returns ``(keywords, scores)`` as two equal-length tuples; both are
    empty when the ranking is empty.
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # syntactic filter: retrieve all adjectives and nouns
    words = [word.lower()
             for sent in sentences
             for word, pos in tag(sent)
             if pos in ("JJ", "NN")]

    # TextRank ranking over the levenshtein distance between the words
    ranking = utils.textrank(words, utils.levenshtein)

    top = ranking.most_common(n)
    if not top:
        # Unpacking zip(*[]) raised ValueError when nothing was ranked.
        return (), ()

    # top n keywords
    top_words, scores = zip(*top)
    return top_words, scores

예제 #18

0

파일 보기

파일: machine.py 프로젝트: rishabhdhenkawat/patent-generator

 def sentence_walk(self):
     """Pretty-print the sentences of self.source_text that contain an 'artifact' word.

     The words come from ``search.hypernym_search(self.source_text,
     'artifact')``.  A sentence is kept when any of those words occurs in
     it with a space on both sides.  Nothing is returned.
     """
     sents = tokenize(self.source_text)
     words = set(search.hypernym_search(self.source_text, 'artifact'))
     # Group the alternation: without (?:...) the pattern ' a|b|c ' means
     # ' a' or 'b' or 'c ', so the spaces only guard the first and last
     # words.  Escape the words so regex metacharacters match literally.
     pat = re.compile(' (?:' + '|'.join(re.escape(w) for w in words) + ') ')
     sents = [s for s in sents if pat.search(s) is not None]
     pprint(sents)

예제 #19

0

파일 보기

파일: util.py 프로젝트: henningpeters/sift

def ngrams(text, n=1, lowercase=False):
    """Yield every 1..n word n-gram of each sentence in `text`.

    Sentences come from ``tokenize`` and are split on whitespace.  For each
    sentence all unigrams are yielded first, then all bigrams, and so on up
    to length n; each n-gram is a space-joined string.  With
    ``lowercase=True`` the sentence is lowercased before splitting.
    """
    for sentence in tokenize(text):
        words = (sentence.lower() if lowercase else sentence).split()
        for size in xrange(1, n + 1):
            for start in xrange(len(words) - size + 1):
                yield ' '.join(words[start:start + size])

예제 #20

0

파일 보기

def split_text_to_list_of_sentences(raw_text):
    """Split raw text into sentences by delegating to ``tokenize``.

    Args:
        raw_text (str): text input in paragraphs.
    Returns:
        (list): the sentences returned by ``tokenize``, as strings.
    """
    sentences = tokenize(raw_text)
    return sentences

예제 #21

0

파일 보기

파일: machine.py 프로젝트: johan--/patent-generator

    def key_sentences(self):
        """Pretty-print the sentences that contain an "instrumentality" word, then the words.

        The words come from ``search.hypernym_search(self.source_text,
        "instrumentality")``.  A sentence of ``self.source_text`` is kept
        when any of those words occurs in it with a space on both sides.
        Nothing is returned.
        """
        words = set(search.hypernym_search(self.source_text, "instrumentality"))
        sents = tokenize(self.source_text)
        # Group the alternation: without (?:...) the pattern " a|b|c " means
        # " a" or "b" or "c ", so the spaces only guard the first and last
        # words.  Escape the words so regex metacharacters match literally.
        pat = re.compile(" (?:" + "|".join(re.escape(w) for w in words) + ") ")
        sents = [s for s in sents if pat.search(s) is not None]

        pprint(sents)
        pprint(words)

예제 #22

0

파일 보기

파일: try_pattern_2.py 프로젝트: folagit/resumatcher

def test_findTonkens_3():
    """Tokenize a two-sentence sample with pattern.en and print the result.

    The earlier sample strings were overwritten before use (dead stores),
    so only the sample that was actually tokenized is kept.
    """
    from pattern.en import tokenize
    s = "What's this? This is a book."
    result = tokenize(s)
    # Parenthesised single argument prints the same on Python 2 and 3.
    print(result)

예제 #23

0

파일 보기

파일: Pattern_Parsing.py 프로젝트: nakamichikun/google_search_module_alt

def split_text_to_list_of_sentences(raw_text):
    """Return the sentences of `raw_text` as produced by ``tokenize``.

    Args:
        raw_text (str): text input in paragraphs.
    Returns:
        (list): list of sentence strings.
    """
    sentence_list = tokenize(raw_text)
    return sentence_list

예제 #24

0

파일 보기

파일: try_pattern_2.py 프로젝트: folagit/resumatcher

def test_findTonkens_3():
    """Tokenize a two-sentence sample with pattern.en and print the result.

    The earlier sample strings were overwritten before use (dead stores),
    so only the sample that was actually tokenized is kept.
    """
    from pattern.en import tokenize
    s = "What's this? This is a book."
    result = tokenize(s)
    # Parenthesised single argument prints the same on Python 2 and 3.
    print(result)

예제 #25

0

파일 보기

def sentance_break(origin_text):
    """Break a text into sentences with ``tokenize``.

    Input: output text from gutenberg_text_gather
    Output: tokenized text, a list of strings where each string is a
    sentence
    """
    # tokenize breaks the string apart into a list of sentence strings.
    return tokenize(origin_text)

예제 #26

0

파일 보기

파일: VerbForm.py 프로젝트: kunalmaurya/ml_algorithms

 def form_sentences(self):
     """Group the corpus sentences by the verbs they contain.

     Reads CORPUS_FILE, tokenizes it into sentences and fills
     ``self.sentences`` (a defaultdict of lists) so that each verb in
     VERBS maps to every sentence containing that verb with a space on
     both sides.
     """
     # The context manager closes the file even if reading or tokenizing
     # raises.  NOTE(review): the mode string is unchanged; the 'U' flag
     # is not accepted by newer Python 3 releases -- confirm the target
     # interpreter before changing it.
     with open(CORPUS_FILE, "rbU") as f_p:
         corpus_sentences = pattern.tokenize(f_p.read())
     self.sentences = defaultdict(list)
     for sentence in corpus_sentences:
         for v in VERBS:
             if " " + v + " " in sentence:
                 self.sentences[v].append(sentence)

예제 #27

0

파일 보기

파일: machine.py 프로젝트: rishabhdhenkawat/patent-generator

    def key_sentences(self):
        """Pretty-print the sentences that contain an 'instrumentality' word, then the words.

        The words come from ``search.hypernym_search(self.source_text,
        'instrumentality')``.  A sentence of ``self.source_text`` is kept
        when any of those words occurs in it with a space on both sides.
        Nothing is returned.
        """
        words = set(search.hypernym_search(self.source_text,
                                           'instrumentality'))
        sents = tokenize(self.source_text)
        # Group the alternation: without (?:...) the pattern ' a|b|c ' means
        # ' a' or 'b' or 'c ', so the spaces only guard the first and last
        # words.  Escape the words so regex metacharacters match literally.
        pat = re.compile(' (?:' + '|'.join(re.escape(w) for w in words) + ') ')
        sents = [s for s in sents if pat.search(s) is not None]

        pprint(sents)
        pprint(words)

예제 #28

0

파일 보기

파일: util.py 프로젝트: chaitanyamalaviya/seq2seq-mmi

 def __iter__(self):
     """Yield token sequences from the lyric file(s) named by self.fname.

     ``self.fname`` is one file or a directory whose entries are all read.

     * ``"ohhla"``: one sequence per file -- ``[begin]``, then the tokens
       of every non-empty line each followed by ``'<br>'``, then ``[end]``.
     * ``"ohhla_line_pairs"``: one ``(input, output)`` pair per pair of
       consecutive lines (empty lines included); the input is
       ``[begin] + tokens + [end]`` and the output is ``tokens + [end]``.
     """
     if os.path.isdir(self.fname):
         filenames = [os.path.join(self.fname, entry)
                      for entry in os.listdir(self.fname)]
     else:
         filenames = [self.fname]

     for filename in filenames:
         with open(filename) as f:
             doc = f.read()
             if self.mode == "ohhla":
                 toks = [self.begin]
                 for line in doc.split("\n"):
                     if line:
                         toks += ' '.join(tokenize(line)).split(" ") + ['<br>']
                 yield toks + [self.end]
             elif self.mode == "ohhla_line_pairs":
                 lines = [tokenize(line) for line in doc.split("\n")]
                 for k in range(1, len(lines)):
                     first, second = lines[k - 1], lines[k]
                     inp_toks = [self.begin] + ' '.join(first).split(" ") + [self.end]
                     outp_toks = ' '.join(second).split(" ") + [self.end]
                     yield (inp_toks, outp_toks)

예제 #29

0

파일 보기

파일: preprocess.py 프로젝트: i-Hun/thesis-code

def tokenize_pattern(text):
    """
    Return the lowercased tokens of `text` as one flat list.

    tokenize() returns a list of sentences with punctuation marks split from
    the words; each sentence is then split on whitespace and lowercased.
    """
    marks = ".,;:!?()[]{}`''\"@#$^&*+-|=~_«»…".decode("utf8")
    # tokenize() returns sentences in which every punctuation mark stands
    # alone, e.g. "... « ... : ... » ( The Hunger Games : Mockingjay – Part 1 ) ."
    sents = tokenize(text, punctuation=marks, replace={})
    tokens = []
    for sent in sents:
        for token in sent.split():
            tokens.append(token.lower())
    log.debug("Tokenize with Pattern")
    return tokens

예제 #30

0

파일 보기

    def __call__(self, org_doc):
        """Replace every known phrase in the document with its mapped token.

        ``self.X`` maps phrases to replacements.  Candidate phrases are the
        keys of ``self.X`` that occur as substrings of the lowercased
        document (ignoring tokens that contain "_").  The document is
        tokenized into sentences; within each sentence every token-level,
        case-insensitive occurrence of a phrase is replaced by
        ``self.X[phrase]``.  The sentences are returned joined by spaces.
        """
        doc = org_doc

        lowered = ' '.join([t for t in doc.lower().split() if "_" not in t])

        # Identify which phrases were used
        keywords = [key for key in self.X if key in lowered]
        punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~"

        # Replace the phrases one by one: inefficient, but less error prone.
        rewritten = []
        for sent in tokenize(doc, punctuation=punctuation):
            for phrase in keywords:
                phrase_tokens = phrase.split()
                span = len(phrase_tokens)
                replacement = self.X[phrase]

                # mask[i] is true where the phrase tokens start at token i
                mask = contains_sublist(sent.lower().split(), phrase_tokens)
                while any(mask):
                    start = mask.index(True)
                    pieces = sent.split()
                    sent = ' '.join(pieces[:start] + [replacement] +
                                    pieces[start + span:])
                    mask = contains_sublist(sent.lower().split(),
                                            phrase_tokens)

            rewritten.append(sent)

        return ' '.join(rewritten)

예제 #31

0

파일 보기

파일: replace_from_dict.py 프로젝트: NIHOPA/pipeline_word2vec

    def __call__(self,org_doc):
        """Substitute the phrases listed in ``self.X`` throughout a document.

        Phrases are the keys of ``self.X`` found as substrings of the
        lowercased document once tokens containing "_" are removed.  Each
        sentence from ``tokenize`` is scanned for token-level,
        case-insensitive matches of every phrase, and each match is replaced
        by the mapped value.  Returns the sentences joined by single spaces.
        """
        doc = org_doc

        kept_tokens = [tok for tok in doc.lower().split() if "_" not in tok]
        ldoc = ' '.join(kept_tokens)

        # Identify which phrases were used
        keywords = [key for key in self.X if key in ldoc]
        punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~"

        # Loop over the keywords and replace them one-by-one.
        # This is inefficient, but less error prone.
        parsed_sent = []

        for sent in tokenize(doc, punctuation=punctuation):

            for word in keywords:
                word_tokens = word.split()
                width = len(word_tokens)
                new_word = self.X[word]

                # Check if the substring tokens match
                mask = contains_sublist(sent.lower().split(), word_tokens)
                while any(mask):
                    idx = mask.index(True)
                    parts = sent.split()
                    parts = parts[:idx] + [new_word,] + parts[idx+width:]
                    sent = ' '.join(parts)
                    mask = contains_sublist(sent.lower().split(), word_tokens)

            parsed_sent.append(sent)

        doc = ' '.join(parsed_sent)
        return doc

예제 #32

0

파일 보기

    def __call__(self, data):
        """Tokenize the sentences of every paragraph and store them as an enrichment.

        Reads the 'sentence_splitter' enrichment of `data`.  Paragraphs
        without a truthy 'content' entry are skipped.  For the others each
        content item is tokenized and re-joined with spaces, so sentence
        splits detected by pattern are merged back into one string.  The
        result is stored under ``self.name`` via ``self.add_enrichment``,
        whose return value is returned.
        """
        tokenized = []
        for paragraph in self.get_enrichment(data, 'sentence_splitter'):
            if 'content' not in paragraph or not paragraph['content']:
                continue
            joined = []
            for sentence in paragraph['content']:
                joined.append(' '.join(tokenize(sentence)))
            tokenized.append({'content': joined, 'type': paragraph['type']})

        return self.add_enrichment(data, self.name, tokenized)

예제 #33

0

파일 보기

파일: word2vector.py 프로젝트: haochengqian/demo

 def __iter__(self):
     """Yield one list of lowercase alphabetic words per non-blank line.

     Walks every file below ``self.dirname``.  Each non-blank line is
     stripped, passed through ``cleanhtml``, tokenized and re-joined, then
     lowercased and split on whitespace; only words for which
     ``str.isalpha()`` is true are kept.
     """
     for root, dirs, files in os.walk(self.dirname):
         for filename in files:
             # `filenam` was a typo that raised NameError on the first file.
             file_path = os.path.join(root, filename)
             # Context manager so each file is closed when we are done.
             with open(file_path) as handle:
                 for line in handle:
                     sline = line.strip()
                     if sline == "":
                         continue
                     rline = cleanhtml(sline)
                     tokenized_line = ' '.join(tokenize(rline))
                     is_alpha_word_line = [word for word in
                                           tokenized_line.lower().split()
                                           if word.isalpha()]
                     yield is_alpha_word_line

예제 #34

0

파일 보기

파일: train_word2vec_with_jieba.py 프로젝트: baby-H/MachineStudy

 def __iter__(self):
     """Yield one list of alphabetic jieba segments per non-blank line.

     Walks every file below ``self.dirname``.  Each non-blank line is
     stripped, passed through ``clean_html``, tokenized and re-joined with
     spaces, then segmented with ``jieba.cut(..., cut_all=False)``; only
     segments for which ``str.isalpha()`` is true are kept.
     """
     for root, dirs, files in os.walk(self.dirname):
         for filename in files:
             file_path = root + '/' + filename
             # Context manager: the original never closed the file.
             with open(file_path) as handle:
                 for line in handle:
                     sline = line.strip()
                     if sline == "":
                         continue
                     rline = clean_html(sline)
                     tokenized_line = ' '.join(tokenize(rline))
                     is_alpha_word_line = [
                         word
                         for word in jieba.cut(tokenized_line, cut_all=False)
                         if word.isalpha()
                     ]
                     yield is_alpha_word_line

예제 #35

0

파일 보기

파일: WordEmbedding.py 프로젝트: subhadeepmaji/ml_algorithms

    def form_sentences(self, text_block, remove_stopwords=False, stem=True):
        """
        Split a block of text into sentences of filtered, optionally stemmed words.

        :param text_block: single block of text as a string
        :param remove_stopwords: forwarded to the private word filter
        :param stem: when true, every kept word goes through self.stemmer
        :return: list of sentences, each a list of words
        """
        swaps = (('\'', ''), ('(', ' '), (')', ' '), ("/", " or "), ("-", ""))

        def scrub(sentence):
            # Apply the character replacements in their original order.
            for old, new in swaps:
                sentence = sentence.replace(old, new)
            return sentence

        def maybe_stem(word):
            return self.stemmer(word) if stem else word

        sentences = [scrub(s) for s in pattern.tokenize(text_block.lower())]
        sentences = [self.sentence_func(TAG_RE.sub('', s)) for s in sentences]
        sentences = [[maybe_stem(w) for w in word_tokenize(s)
                      if self.__word_filter(w, remove_stopwords)]
                     for s in sentences]
        return sentences

예제 #36

0

파일 보기

파일: filter.py 프로젝트: ochachacha/contextualLSTM

def _transform_file(file_path, w2id, split_par=False, debug=False):
    """
    Clean a file of articles and write it back as one flat string of words.

    The file is decoded as latin-1 and split on "</doc>".  Each non-empty
    document is passed through cleanhtml and remove_title, split into
    paragraphs on blank lines and tokenized into sentences.  Words that are
    alphabetic or numeric (per is_number) are mapped through
    known(word, w2id); every non-empty sentence is followed by " <eos> "
    and, when split_par is true, every non-empty paragraph by " <eop> ".
    The result is written with VectorManager.write_string to
    "<file_path>_clean_paragraph" or "<file_path>_clean_simple".

    :param file_path: file to transform
    :param w2id: passed to known() for every kept word
    :param split_par: emit paragraph markers and use the "_clean_paragraph" name
    :param debug: print progress messages
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        docs = f.read().decode("latin-1").split("</doc>")

    file_out = ("%s_clean_paragraph" if split_par
                else "%s_clean_simple") % file_path

    pieces = []
    for doc in docs:
        doc = doc.strip()
        if not doc:
            continue
        paragraphs = [
            tokenize(par)
            for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
            if par
        ]
        for sentences in paragraphs:
            wrote_sentence = False
            for sent in sentences:
                kept = [
                    word for word in sent.lower().split()
                    if word.isalpha() or is_number(word)
                ]
                line = " ".join([known(word, w2id) for word in kept])
                if line:
                    pieces.append(line + " <eos> ")
                    wrote_sentence = True

            if wrote_sentence and split_par:
                pieces.append(" <eop> ")

    file_string = "".join(pieces)
    VectorManager.write_string(file_out, file_string.encode("latin-1"))
    if debug:
        print("Done with %s" % file_path)

예제 #37

0

파일 보기

def _transform_file(file_path, debug=False):
    """
    Clean a file of articles and write it out as plain text, one sentence per line.

    The file content is cleaned with ``cleanhtml`` and split into documents on
    "</doc>". Each document goes through ``cleanhtml`` and ``remove_title``
    again, is split into paragraphs on blank lines and every paragraph is
    passed to ``tokenize``. From each resulting sentence only the lower-cased
    tokens that are alphabetic or accepted by ``is_number`` are kept.

    Output layout: every non-empty sentence on its own line, one extra newline
    after each paragraph that produced a sentence, and one more after each
    document that produced a sentence. The text is latin-1 encoded and handed
    to ``VectorManager.write_string`` under the name "<file_path>_wl"
    (words list).

    :param file_path: file to transform
    :param debug: print progress messages
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        # .decode() on the read result requires a byte string (Python 2 file).
        raw = f.read().decode("latin-1")
    # NOTE(review): cleanhtml runs on the whole file before the "</doc>" split
    # and again per document below; if cleanhtml strips tags, the "</doc>"
    # delimiters would already be gone here -- confirm against cleanhtml.
    docs = cleanhtml(raw).split("</doc>")
    del raw
    file_out = "%s_wl" % file_path

    # Collect the pieces and join once at the end instead of growing a string
    # with += for every sentence.
    pieces = []
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [
            tokenize(par)
            for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
            if par
        ]
        doc_a = False  # becomes True once the document yields a sentence
        for p in paragraphs:
            par_a = False  # becomes True once the paragraph yields a sentence
            for sent in p:
                line = " ".join([
                    word for word in sent.lower().split()
                    if word.isalpha() or is_number(word)
                ])
                if line:
                    pieces.append(line + "\n")
                    par_a = True
                    doc_a = True

            if par_a:
                pieces.append("\n")
        if doc_a:
            pieces.append("\n")

    VectorManager.write_string(file_out, "".join(pieces).encode("latin-1"))
    if debug:
        print("Done with %s" % file_path)

예제 #38

0

파일 보기

파일: nonet_generator_object_version.py 프로젝트: Mikhaela/TextMining

	def get_raw_text(self):
		""" Load the cached book text and sentence-tokenize it into self.raw_text.

			When self.command is truthy the text is first downloaded from
			self.url and pickled to 'buddhist_psalm_text.pickle' in the
			current working directory. In every case the pickle file is then
			read back and passed to tokenize(); the result is stored in
			self.raw_text. Nothing is returned.

			If self.command is falsy the pickle file must already exist from
			an earlier run, otherwise open() raises.
		"""
		if self.command: # If I tell it to load data from url
			buddhist_psalm_text = URL(self.url).download()

			# Save data to a file (will be part of your data fetching script).
			# The with-block closes the handle even if pickling fails.
			with open('buddhist_psalm_text.pickle','wb') as f:
				pickle.dump(buddhist_psalm_text,f)

		# Load data from a file (will be part of your data processing script).
		# NOTE(review): pickle.load runs arbitrary code from the file; this is
		# only safe while the pickle is one this program wrote itself.
		with open('buddhist_psalm_text.pickle','rb') as input_file:
			book_text = pickle.load(input_file)
		# Use pattern to break the text into a list of strings, one per sentence
		self.raw_text = tokenize(book_text)

예제 #39

0

파일 보기

    def form_sentences(self,
                       text_block,
                       block_id,
                       remove_stopwords=False,
                       stem=True,
                       form_tagged_doc=True):
        """
        Split a block of text into sentences and word-tokenize each of them.

        The text is lower-cased and sentence-split with ``pattern.tokenize``;
        quotes and hyphens are dropped, parentheses become spaces and "/"
        becomes " or ". Matches of ``TAG_RE`` are removed and the result is
        passed through ``self.sentence_func`` before word tokenization.

        :param text_block: single block of text as string
        :param block_id: id of the text block, used as prefix of each tag
        :param remove_stopwords: forwarded to the word filter
        :param stem: apply ``self.stemmer`` to every kept word
        :param form_tagged_doc: wrap each sentence in a ``TaggedDocument``
            tagged "<block_id> <sentence index>" and record it in
            ``self.doc_tags``
        :return: list of token lists, or list of ``TaggedDocument`` when
            ``form_tagged_doc`` is true
        """
        substitutions = (('\'', ''), ('(', ' '), (')', ' '), ("/", " or "), ("-", ""))

        def scrub(raw_sentence):
            # Apply the character substitutions in their original order.
            for old, new in substitutions:
                raw_sentence = raw_sentence.replace(old, new)
            return raw_sentence

        def normalize(word):
            if stem:
                return self.stemmer(word)
            return word

        # Each pass runs over all sentences before the next one starts.
        scrubbed = [scrub(s) for s in pattern.tokenize(text_block.lower())]
        prepared = [self.sentence_func(TAG_RE.sub('', s)) for s in scrubbed]

        token_lists = []
        for prepared_sentence in prepared:
            kept = []
            for w in word_tokenize(prepared_sentence):
                if self.__word_filter(w, remove_stopwords):
                    kept.append(normalize(w))
            token_lists.append(kept)

        if not form_tagged_doc:
            return token_lists

        tagged_docs = []
        for index, words in enumerate(token_lists):
            tag = str(block_id) + ' ' + str(index)
            tagged_docs.append(TaggedDocument(words=words, tags=[tag]))

        # Register every tagged sentence under its (single) tag.
        for tagged in tagged_docs:
            self.doc_tags[tagged.tags[0]] = tagged

        return tagged_docs

예제 #40

0

파일 보기

파일: graph_tools.py 프로젝트: cosbynator/karma-prediction-cs224w

def text_sentiment(text):
    """
    Aggregate per-sentence sentiment of ``text`` into a ``Sentiment`` record.

    Returns ``default_sentiment`` for falsy input. Otherwise the text is
    passed through ``plaintext`` and ``tokenize``, ``sentiment`` is evaluated
    per sentence, and the mean and standard deviation of element 0 (polarity)
    and element 1 (subjectivity) are computed. Any statistic that comes out
    as NaN is replaced by 0.0. The last field is the number of sentences.
    """
    if not text:
        return default_sentiment

    sentences = tokenize(plaintext(text))
    scores = [sentiment(sentence) for sentence in sentences]
    polarities = [score[0] for score in scores]
    subjectivities = [score[1] for score in scores]

    stats = [
        np.mean(polarities),
        np.std(polarities),
        np.mean(subjectivities),
        np.std(subjectivities),
    ]
    # NaN values are mapped to a neutral 0.0.
    polarity_mean, polarity_std, subjectivity_mean, subjectivity_std = [
        0.0 if math.isnan(value) else value for value in stats
    ]

    return Sentiment(polarity_mean, polarity_std, subjectivity_mean, subjectivity_std, len(sentences))

예제 #41

0

파일 보기

파일: WordEmbedding.py 프로젝트: kunalmaurya/ml_algorithms

    def form_sentences(self, text_block, remove_stopwords=False, stem=True):
        """
        Split a block of text into sentences and word-tokenize each of them.

        The text is lower-cased and sentence-split with ``pattern.tokenize``;
        quotes and hyphens are dropped, parentheses become spaces and "/"
        becomes " or ". Matches of ``TAG_RE`` are removed and the result is
        passed through ``self.sentence_func`` before word tokenization.

        :param text_block: single block of text as string
        :param remove_stopwords: forwarded to the word filter
        :param stem: apply ``self.stemmer`` to every kept word
        :return: list of sentences, each a list of (optionally stemmed) words
        """
        def scrub(raw_sentence):
            # Same substitutions, same order, one statement per replacement.
            raw_sentence = raw_sentence.replace('\'', '')
            raw_sentence = raw_sentence.replace('(', ' ')
            raw_sentence = raw_sentence.replace(')', ' ')
            raw_sentence = raw_sentence.replace("/", " or ")
            return raw_sentence.replace("-", "")

        def normalize(word):
            if stem:
                return self.stemmer(word)
            return word

        # Each pass runs over all sentences before the next one starts.
        scrubbed = [scrub(s) for s in pattern.tokenize(text_block.lower())]
        prepared = [self.sentence_func(TAG_RE.sub('', s)) for s in scrubbed]

        token_lists = []
        for prepared_sentence in prepared:
            kept = []
            for w in word_tokenize(prepared_sentence):
                if self.__word_filter(w, remove_stopwords):
                    kept.append(normalize(w))
            token_lists.append(kept)
        return token_lists

예제 #42

0

파일 보기

파일: summarize.py 프로젝트: davinirjr/summarize

def summarize(text, sentence_count=2):
    """
    Return an extractive summary made of the top-ranked sentences of ``text``.

    Sentences come from ``tokenize``. Every pair of sentences is connected in
    a graph with weight ``1 - jaccard(keywords_a, keywords_b)`` whenever that
    value is positive, the graph is ranked with ``pagerank``, and the
    ``sentence_count`` best-ranked sentences are joined with spaces in their
    original order.

    :param text: text to summarize
    :param sentence_count: maximum number of sentences in the summary
    :return: the selected sentences joined by single spaces
    """
    sentence_list = tokenize(text)

    # each document's name is the sentence's original index
    # so that we can put them back together later
    docs = [Document(string=sentence, name=index, stemmer=LEMMA)
            for index, sentence in enumerate(sentence_list)]

    # Extract each document's keyword list once, not once per pair.
    # Element 1 of every keywords() entry is the word itself.
    wordsets = [[entry[1] for entry in doc.keywords()] for doc in docs]

    graph = Graph()
    for (doc_a, wordset_a), (doc_b, wordset_b) in combinations(
            zip(docs, wordsets), 2):
        similarity = 1 - jaccard(wordset_a, wordset_b)
        if similarity > 0:
            graph.add_edge(doc_a.name, doc_b.name, weight=similarity)

    sentences_by_rank = sorted(
        pagerank(graph).items(), key=itemgetter(1), reverse=True)
    best_sentences_in_order = sorted(
        index for index, _ in sentences_by_rank[:sentence_count])

    return ' '.join(sentence_list[index] for index in best_sentences_in_order)

예제 #43

0

파일 보기

파일: try_pattern_2.py 프로젝트: folagit/resumatcher

def test_tokenize():
    """Run pattern.en.tokenize on a sample resume line and print the result."""
    from pattern.en import tokenize

    sent = "Randstad Technologies - Baltimore , MD - June 2014 to Present Responsibilities Johns Hopkins University , Krieger School of Arts & Sciences June 2014 - present Input Content for websites using the WordPress interface Modified and configured WordPress plug-ins and themes to match design Created Email template for Dean 's Newsletter Launched website and created redirects using .htaccess and Apache conf file"
    lines = tokenize(sent)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(lines)

예제 #44

0

파일 보기

파일: RelationExtractor.py 프로젝트: subhadeepmaji/ml_algorithms

    def form_relations(self, text, block_id, payload, ff, persist=True):
        """
        Form relation(s) on a given text.

        The text is sentence-tokenized with ``pattern.tokenize``; each sentence
        is annotated by ``self.relation_annotator`` and every semantic-role
        frame yields RelationTuples for (a) each pair of arguments and
        (b) each modifier paired with the argument nearest to it in the
        sentence. Sentences whose annotation raises are logged and skipped.

        :param text: text on which to get the relations on,
        text will be sentence tokenized and relations formed at sentence level
        :param block_id: unique identifier of the block, stored on every tuple
        :param payload: stored unchanged on every RelationTuple
        :param ff: stored unchanged on every RelationTuple
        :param persist: documented as "persist the relations extracted from the
        text in the sink, relation_sink needed to be specified".
        NOTE(review): not read anywhere in this method body -- confirm whether
        persisting is handled elsewhere.
        :return: list of relations
        """
        text_sentences = pattern.tokenize(text)
        relations = []
        for sentence in text_sentences:

            # work with ascii string only: keep code points 1..126
            sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
            try:
                senna_annotation = self.relation_annotator.getAnnotations(sentence)
            except Exception as e:
                # Best effort: a sentence the annotator cannot handle is dropped.
                logger.error(e)
                continue

            chunk_parse, pos_tags, role_labeling, tokenized_sentence = \
                senna_annotation['chunk'], senna_annotation['pos'], senna_annotation['srl'], \
                senna_annotation['words']

            # nothing to do here empty srl
            if not role_labeling: continue

            # One semantic_element per entry in the 'srl' annotation.
            for semantic_element in role_labeling:
                arguments = RelationExtractor.__populate_arguments(semantic_element)
                modifiers = RelationExtractor.__populate_modifier(semantic_element)
                verb = semantic_element.get('V')
                # order of the arguments returned is important, A0 --> A1 --> A2 --> A3
                # NOTE(review): the order here is whatever vars(...).itervalues()
                # yields, i.e. the attribute-dict iteration order, which a plain
                # Python 2 dict does not guarantee -- confirm the ordering holds.
                arguments = [v for v in vars(arguments).itervalues() if v]
                modifiers = [v for v in vars(modifiers).itervalues() if v]

                if not arguments: continue
                # Every pair (ai, aj) with ai positioned before aj in `arguments`.
                argument_pairs = [e for e in ((ai, aj) for i, ai in enumerate(arguments) for j, aj
                                              in enumerate(arguments) if i < j)]

                verb = relation_util.normalize_relation(verb)

                # (a) argument <-> argument relations
                for a0, a1 in argument_pairs:
                    en0 = relation_util.form_entity(tokenized_sentence, a0, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, a1, chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff = ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

                # (b) modifier relations: attach each modifier to the argument
                # whose first occurrence (str.find offset) is closest to the
                # modifier's first occurrence in the sentence.
                # NOTE(review): str.find returns -1 when the text is not found,
                # which would skew the distance -- confirm inputs always occur
                # verbatim in `sentence`.
                for arg_modifier in modifiers:
                    mod_pos = sentence.find(arg_modifier)
                    linked_arg = min([(a, abs(mod_pos - sentence.find(a))) for a in arguments], key=lambda e: e[1])[0]
                    en0 = relation_util.form_entity(tokenized_sentence, linked_arg, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, arg_modifier, chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

        return relations

예제 #45

0

파일 보기

파일: opinion.py 프로젝트: AvijitGhosh82/Tweets_NLP

# Collect the negative lexicon: skip empty lines and ';' comment lines.
for line in neg:
	for v in line.split("\n"):
		if v and v[0] != ';':
			neglist.append(v.strip())

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(poslist)
print(neglist)
print(lines)

# Drop empty entries. List comprehensions (rather than filter) keep both
# names bound to real lists on Python 3 as well.
poslist = [entry for entry in poslist if entry]
neglist = [entry for entry in neglist if entry]

# Sets give O(1) membership tests inside the word loop below.
pos_words = set(poslist)
neg_words = set(neglist)

for line in lines:
	for s in tokenize(line):
		# tokenize() returns whole sentences as strings whose tokens are
		# separated by spaces, so the words are obtained by splitting the
		# sentence. Iterating tokenize(s) again would yield the sentence
		# itself and the lexicon lookup would almost never match.
		for word in s.split():
			if word in pos_words:
				posop.append(line)
			elif word in neg_words:
				negop.append(line)

# De-duplicate: a line is appended once per matching word.
posop=list(set(posop))
negop=list(set(negop))

print("positive")
for p in posop:
	print(p)
print("negative")

예제 #46

0

파일 보기

파일: opinion.py 프로젝트: evijit/Tweets_NLP

# Collect the negative lexicon: skip empty lines and ';' comment lines.
for line in neg:
    for v in line.split("\n"):
        if v and v[0] != ';':
            neglist.append(v.strip())

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(poslist)
print(neglist)
print(lines)

# Drop empty entries. List comprehensions (rather than filter) keep both
# names bound to real lists on Python 3 as well.
poslist = [entry for entry in poslist if entry]
neglist = [entry for entry in neglist if entry]

# Sets give O(1) membership tests inside the word loop below.
pos_words = set(poslist)
neg_words = set(neglist)

for line in lines:
    for s in tokenize(line):
        # tokenize() returns whole sentences as strings whose tokens are
        # separated by spaces, so the words are obtained by splitting the
        # sentence. Iterating tokenize(s) again would yield the sentence
        # itself and the lexicon lookup would almost never match.
        for word in s.split():
            if word in pos_words:
                posop.append(line)
            elif word in neg_words:
                negop.append(line)

# De-duplicate: a line is appended once per matching word.
posop = list(set(posop))
negop = list(set(negop))

print("positive")
for p in posop:
    print(p)
print("negative")

예제 #47

0

파일 보기

파일: jobdescparser.py 프로젝트: folagit/resumatcher

def splitSentences(text):
    """Split ``text`` into sentences by delegating to ``tokenize``.

    Returns whatever ``tokenize(text)`` returns, unchanged.
    """
  # Earlier NLTK-based variant, kept for reference:
  #  return nltk.tokenize.sent_tokenize(text)
  # use pattern package
    return tokenize(text)

예제 #48

0

파일 보기

파일: question_tester.py 프로젝트: mrittha/erotao

    verb=' '.join(bits_to_words(basic_sentence['VP']))

    if verb=='is':
        return "What is "+sbj.lower()+"? "+obj

    return "What does "+sbj.lower()+" "+lemma(verb.lower())+"?"+" "+obj







# Sample passage used to drive the question generator below.
text="""
A star is a massive ball of plasma (very hot gas) held together by gravity. It radiates energy because of the nuclear reactions inside it

It radiates heat and light, and every other part of the electromagnetic spectrum, such as radio waves, micro-waves, X-rays, gamma-rays and ultra-violet radiation. The proportions vary according to the mass and age of the star.

The energy of stars comes from nuclear fusion. This is a process that turns a light chemical element into another heavier element. Stars are mostly made of hydrogen and helium. They turn the hydrogen into helium by fusion. When a star is near the end of its life, it begins to change the helium into other heavier chemical elements, like carbon and oxygen. Fusion produces a lot of energy. The energy makes the star very hot. The energy produced by stars radiates away from them. The energy leaves as electromagnetic radiation.
"""
sentences=tokenize(text)
basic_sentences=[]
for sentence in sentences:
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(sentence)
    # Extend in place instead of rebuilding the list for every sentence.
    basic_sentences.extend(gather_question_bits(sentence))

basic_sentences=convert_pp(basic_sentences)
# Turn every gathered basic sentence into a question and show it.
for sentence in basic_sentences:
    print(basic_sentence_to_question(sentence))

예제 #49

0

파일 보기

파일: testing_Pattern.py 프로젝트: seanli310/NLP-project

#refer to http://textminingonline.com/getting-started-with-pattern

# Demo 1: tokenize() on a short phrase and on a multi-sentence paragraph.
from pattern.en import tokenize

f = """this’s pattern word tokenize"""
print "tokens:", tokenize(f)
sent_tokenize_test = """Tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens. The list of tokens becomes input for further processing such as parsing or text mining. Tokenization is useful both in linguistics (where it is a form of text segmentation), and in computer science, where it forms part of lexical analysis."""
print "sentence:",tokenize(sent_tokenize_test)


# Demo 2: tag() on a paragraph; the result is printed as returned.
from pattern.en import tag

g = """In corpus linguistics, part-of-speech tagging (POS tagging or POST), also called grammatical tagging or word-category disambiguation, is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition, as well as its context—i.e. relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc."""
tagged_result = tag(g)

print tagged_result


# Demo 3: word-inflection helpers. The calls below discard their return
# values, so they only show output when run in an interactive session.
from pattern.en import referenced
referenced('book')

from pattern.en import singularize
singularize('wolves')

from pattern.en import comparative
comparative('bad')
# result noted by the original author: 'worse'

from pattern.en import superlative

예제 #50

0

파일 보기

파일: NER-StanfordNLP-annotate.py 프로젝트: AvijitGhosh82/NLP

          text = str(text).replace('\xf7','').replace('\xc3\xba','').replace('\xb6','').replace('\xa9','').replace('\xe2\x99\xaa','')
          text = str(text).replace('\xc3\xaf','').replace('\x5c','').replace('\xf1','').replace('\xe1','').replace('\xe7','').replace('\xfa','')
          text = str(text).replace('\xf3','').replace('\xed','').replace('\xe9','').replace('\xe0','').replace('\xae','').replace('\xc2','')
          text = str(text).replace('\xc3','').replace('\xa2','').replace('\xbf','')
#         print text
      except IndexError:
          print line
          continue

# G. Remove clearly wrong unicode characters -- BOM, NULL (only utf8 hex works)
      line = str(line).replace('\x00 ','').replace('\xef\xbf\xbd','')
      print line,

# H. Ensure the text is split into sentences
     # tokenize(string, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", replace={})
      for sentence in tokenize(text):
         all = ""

# I. Select the parser
         if sentence.isupper() or sentence.islower(): st = UPP
         else: st = Mix

# J. Parts of speech with stanford-ner via pyner
         reply = st.get_entities(sentence)
         # {u'PERSON': [u'Bill Clinton'], u'LOCATION': [u'U.S.'], u'O': [u'was President of the']}
         try:
             for tup in reply.items():
                names = ""
                if tup[0] == "O" or not tup[0] : continue
                for name in tup[1]:
                   names = "".join([names,"/",name])