Example #1
    def preprocess(self,
                   lowercase=False,
                   no_punct=False,
                   no_urls=False,
                   no_stop_words=False):
        """ Preprocess text for matching """

        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in self.text.splitlines())

        # Break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split('  '))

        # Drop blank lines
        self.text = '\n'.join(chunk for chunk in chunks if chunk)

        # Handle lowercase, no_punct and no_urls
        self.text = textacy.preprocess_text(self.text,
                                            lowercase=lowercase,
                                            no_punct=no_punct,
                                            no_urls=no_urls)

        # Handle stop words
        if no_stop_words:
            stop_words = set(stopwords.words('english'))
            word_tokens = nltk.word_tokenize(self.text)
            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            self.text = ' '.join(filtered_sentence)
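A minimal, self-contained sketch of the same flow on a plain string. It assumes an older textacy release (pre-0.8, where preprocess_text still exists at the top level) and that the NLTK punkt and stopwords data have been downloaded; the sample text is made up.

import textacy
import nltk
from nltk.corpus import stopwords

raw = "Visit https://example.com  NOW!\n\n  Great Deals   Today Only  "

# Collapse lines and multi-space "headlines" into clean lines.
lines = (line.strip() for line in raw.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
text = '\n'.join(chunk for chunk in chunks if chunk)

# Lowercase, strip punctuation and URLs (old textacy preprocess API).
text = textacy.preprocess_text(text, lowercase=True, no_punct=True, no_urls=True)

# Drop English stop words.
stop_words = set(stopwords.words('english'))
tokens = [w for w in nltk.word_tokenize(text) if w not in stop_words]
print(' '.join(tokens))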
Example #2
def retrieve_paragraphs():
    import textacy

    logging.info('Obtaining paragraphs')
    db = CederMT()
    training_collection = db.SynthesisTypeFilteringData
    syn_20170926 = db.syn_20170926

    paragraphs = list(
        training_collection.find(
            {
                'is_training_data': True,
                'human_validated': True
            }, {
                '_id': 0,
                'time_created': 0
            }))

    for paragraph in tqdm(paragraphs):
        paper = syn_20170926.find_one({
            'doi': paragraph['doi'],
        }, {
            '_id': 1,
            'paragraphs': {
                '$slice': [paragraph['paragraph_id'], 1]
            }
        })
        text = paper['paragraphs'][0]['text']
        paragraph['text'] = textacy.preprocess_text(text, fix_unicode=True)

    return paragraphs
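The $slice projection above returns only the single array element at index paragraph_id instead of the whole paragraphs field. A stripped-down pymongo sketch of that projection; the host, database name, and DOI below are hypothetical placeholders.

from pymongo import MongoClient

client = MongoClient("localhost", 27017)      # hypothetical local MongoDB
papers = client["synthesis"]["syn_20170926"]  # hypothetical database name

paragraph_id = 3
paper = papers.find_one(
    {"doi": "10.1000/example"},                               # hypothetical DOI
    {"_id": 1, "paragraphs": {"$slice": [paragraph_id, 1]}},  # skip 3, take 1
)
# paper["paragraphs"] holds at most one element: the paragraph at index 3.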
Example #3
File: hashtag.py Project: edenbaus/NLP
def prepare_text(document):
    """
    textacy - spacy wrapper
    cleans up text in one go
    -lower case
    -remove numbers
    -remove punctuation
    -undo contractions
    -removes accents
    -fixes garbled unicode
    -removes currency symbols
    -string replace '\n' with '' too
    """
    text_processing = textacy.preprocess_text(
                        nlp(document).text.replace('-',' ').replace('\n',''),
                        fix_unicode=True,
                        lowercase=True,
                        transliterate=False,
                        no_urls=False,
                        no_emails=False,
                        no_phone_numbers=False,
                        no_numbers=True,
                        no_currency_symbols=True,
                        no_punct=True,
                        no_contractions=True,
                        no_accents=True
                    )
    prepared_text = nlp(text_processing)
    print('cleaning text...')
    return prepared_text
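The same flag combination can be checked directly on a short string. A hedged sketch assuming an older textacy release (pre-0.8) where preprocess_text is available; the sample sentence is invented, and the exact placeholder tokens depend on the textacy version.

import textacy

sample = "I can't pay \u00a39.99 for caf\u00e9 Wi-Fi!\n"
print(textacy.preprocess_text(
    sample,
    fix_unicode=True,
    lowercase=True,
    no_numbers=True,
    no_currency_symbols=True,
    no_punct=True,
    no_contractions=True,
    no_accents=True,
))
# Expect lowercased text with the contraction unpacked, the accent stripped,
# and the digits/currency symbol replaced by placeholder tokens.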
Example #4
def build_comp_termlist():
    import os
    spacy_lang = en_core_web_sm.load()
    texts = []
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        for comp_path in glob.glob(kag_path + '/*'):
            _, comp_name = os.path.split(comp_path)
            #            print ('=Handling competence ' + comp_name)
            comp_text = ''
            for filename in glob.glob(comp_path + '/*.txt'):
                clean_text = preprocess_text(open(filename,
                                                  'r').read().decode('utf-8'),
                                             no_urls=True,
                                             no_emails=True,
                                             no_phone_numbers=True,
                                             no_numbers=True,
                                             no_currency_symbols=True,
                                             no_punct=True,
                                             no_contractions=True,
                                             no_accents=True)
                comp_text = ' '.join(text for text in (comp_text, clean_text))


#                comp_text += open(filename, 'r').read().decode('utf-8')
            doc = textacy.Doc(comp_text, lang=spacy_lang)
            texts.append(
                doc.to_terms_list(named_entities=False, as_strings=True))
    return texts
Example #5
 def test_plaintext_functionality(self):
     expected_1 = 'mr president i ask to have printed in the record copies of some of the'
     observed_1 = textacy.preprocess_text(self.text, lowercase=True, no_punct=True)[:70]
     expected_2 = [('ed States of America is an amazing ',
                    'nation',
                    ' that continues to lead the world t'),
                   ('come the role model for developing ',
                    'nation',
                    's attempting to give their people t'),
                   ('ve before to better ourselves as a ',
                    'nation',
                    ', because what we change will set a'),
                   ('nd education. Fortunately, we as a ',
                    'nation',
                    ' have the opportunity to fix the in'),
                   (' sentences. Judges from across the ',
                    'nation',
                    ' have said for decades that they do'),
                   ('reopened many racial wounds in our ',
                    'nation',
                    '. The war on drugs also put addicts')]
     observed_2 = list(textacy.text_utils.keyword_in_context(
         self.text, 'nation', window_width=35, print_only=False))
     self.assertEqual(observed_1, expected_1)
     self.assertEqual(observed_2, expected_2)
Example #6
def preprocess_text(from_article):
    return textacy.preprocess_text(from_article.get('title', '') + '. ' +
                                   from_article.get('abstract', ''),
                                   lowercase=True,
                                   transliterate=True,
                                   no_punct=True,
                                   no_numbers=True)
Example #7
    def preprocess(self,
                   lowercase=False,
                   no_punct=False,
                   no_urls=False,
                   no_stop_words=False):
        """ Preprocess text for matching """

        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in self.text.splitlines())

        # Break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split('  '))

        # Drop blank lines
        self.text = '\n'.join(chunk for chunk in chunks if chunk)

        # Handle lowercase, no_punct and no_urls
        self.text = textacy.preprocess_text(self.text,
                                            lowercase=lowercase,
                                            no_punct=no_punct,
                                            no_urls=no_urls)

        # Handle stop words
        if no_stop_words:
            self.text = self._delete_stop_words(self.text)
Example #8
File: test_readme.py Project: pwin/textacy
 def test_plaintext_functionality(self):
     expected_1 = 'mr president i ask to have printed in the record copies of some of the'
     observed_1 = textacy.preprocess_text(self.text, lowercase=True, no_punct=True)[:70]
     expected_2 = [('ed States of America is an amazing ',
                    'nation',
                    ' that continues to lead the world t'),
                   ('come the role model for developing ',
                    'nation',
                    's attempting to give their people t'),
                   ('ve before to better ourselves as a ',
                    'nation',
                    ', because what we change will set a'),
                   ('nd education. Fortunately, we as a ',
                    'nation',
                    ' have the opportunity to fix the in'),
                   (' sentences. Judges from across the ',
                    'nation',
                    ' have said for decades that they do'),
                   ('reopened many racial wounds in our ',
                    'nation',
                    '. The war on drugs also put addicts')]
     observed_2 = list(textacy.text_utils.keyword_in_context(
         self.text, 'nation', window_width=35, print_only=False))
     self.assertEqual(observed_1, expected_1)
     self.assertEqual(observed_2, expected_2)
Example #9
def soft_preprocess(df):
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(ftfy.fix_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(clean_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda x: x.replace('"', "'").replace("\n", " "))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda text: textacy.preprocess_text(
            text, no_currency_symbols=True, no_urls=True, no_emails=True,
            no_phone_numbers=True, no_numbers=True))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(fix_contractions)
    return df
Example #10
def extract_keyphrases_from_text(text, spacy_en):
    cleaned = preprocess_text(text,
                              fix_unicode=True,
                              lowercase=True,
                              transliterate=True)
    noun_phrases = [np.text for np in spacy_en(cleaned).noun_chunks]
    # remove ones too short, lemmatize, etc..
    cleankeys = regularise_keys(noun_phrases)
    return cleankeys
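A plain-spaCy sketch of the noun_chunks call used above, with no textacy dependency. It assumes the en_core_web_sm model is installed; the sentence is made up.

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("the quick brown fox jumps over the lazy dog near the old barn")
print([chunk.text for chunk in doc.noun_chunks])
# e.g. ['the quick brown fox', 'the lazy dog', 'the old barn']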
Example #11
def hard_preprocess(df):
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(ftfy.fix_text)
    df.iloc[:, 1] = df.iloc[:, 1].apply(fix_contractions)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(clean_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda x: x.replace('"', '').replace("\n", " ").replace("\\", ""))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda text: textacy.preprocess_text(
            text, no_currency_symbols=True, no_urls=True, no_emails=True,
            no_phone_numbers=True, no_numbers=True))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(lambda text: " ".join(
        [word for word in simple_preprocess(text) if word not in stop_words]).strip())
    return df
Example #12
def read_corpus(corpus_path):
    #    spacy_lang = data.load_spacy('en')
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(u'en')
    for filename in glob.glob(corpus_path + '/*.txt'):
        content = open(filename, 'r').read().decode('utf-8')
        spacy_doc = spacy_lang(preprocess_text(content), parse=False)
        corpus.add_doc(spacy_doc)

    return corpus
Example #13
 def preprocess_text_settings(string_in):
     string_in = preprocess_text(string_in,
                                 fix_unicode=True,
                                 lowercase=True,
                                 no_urls=True,
                                 no_emails=True,
                                 no_numbers=True,
                                 no_accents=True,
                                 no_punct=True)
     return string_in
Example #14
    def get_textacy_doc(text):
        """
        Gets document of textacy library
        :param text: Text of which textacy doc to get
        :return: tuple Textacy doc, Processed text
        """
        en = textacy.load_spacy_lang(NLPService._WORD_MODEL_NAME,
                                     disable=('parser', ))
        processed_text = textacy.preprocess_text(text,
                                                 lowercase=True,
                                                 no_punct=True)

        return textacy.make_spacy_doc(processed_text, lang=en), processed_text
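A standalone sketch of the newer textacy API used above (load_spacy_lang plus make_spacy_doc, textacy 0.8+); the model name and sample text are assumptions on my part.

import textacy

en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))
doc = textacy.make_spacy_doc("mr president i ask to have printed in the record", lang=en)
print(len(doc), doc[:4])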
Example #15
 def process_content(self):
     if self._content is not None:
         ct = 0
         for document in self._content.documents:
             metadata = {}
             try:
                 metadata['title'] = self._content.titles[ct]
             except IndexError:
                 metadata['title'] = 'Empty'
             self._corpus.add_text(textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True),
                                   metadata=metadata)
             ct += 1
         self.load_matrix()
Example #16
def preprocess_corpus_textacy(corpus):
    """Remove numbers, whitespace, punctuation and make lower case"""
    preprocessed = corpus.copy()
    for index, row in preprocessed.iteritems():
        preprocessed[index] = re.sub(
            r'\w*\d\w*', '', preprocessed[index]).strip()  #remove numbers
        preprocessed[index] = textacy.preprocess.normalize_whitespace(
            preprocessed[index])
        preprocessed[index] = textacy.preprocess_text(preprocessed[index],
                                                      lowercase=True,
                                                      no_punct=True)
    print('preprocessed corpus')
    return preprocessed
Example #17
def read_clean_text(file):
    f = open('./data/FixedJudgements/' + file + '.txt')
    raw = f.read()
    text = drop_html(raw)
    text = textacy.preprocess.unpack_contractions(text)
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = regex.sub(' ', text)
    text = text.replace('\n', ' ')
    text = preprocess_text(text,
                           lowercase=True,
                           fix_unicode=True,
                           no_numbers=True)
    return text
Example #18
 def test_plaintext_functionality(self):
     preprocessed_text = textacy.preprocess_text(
         self.text, lowercase=True, no_punct=True)[:100]
     self.assertTrue(
         all(char.islower() for char in preprocessed_text if char.isalpha()))
     self.assertTrue(
         all(char.isalnum() or char.isspace() for char in preprocessed_text))
     keyword = 'America'
     kwics = textacy.text_utils.keyword_in_context(
         self.text, keyword, window_width=35, print_only=False)
     for pre, kw, post in kwics:
         self.assertEqual(kw, keyword)
         self.assertIsInstance(pre, textacy.compat.unicode_)
         self.assertIsInstance(post, textacy.compat.unicode_)
Example #19
def test_plaintext_functionality(text):
    preprocessed_text = preprocess_text(text, lowercase=True,
                                        no_punct=True)[:100]
    assert all(char.islower() for char in preprocessed_text if char.isalpha())
    assert all(char.isalnum() or char.isspace() for char in preprocessed_text)
    keyword = 'America'
    kwics = text_utils.keyword_in_context(text,
                                          keyword,
                                          window_width=35,
                                          print_only=False)
    for pre, kw, post in kwics:
        assert kw == keyword
        assert isinstance(pre, compat.unicode_)
        assert isinstance(post, compat.unicode_)
Example #20
def read_texts(corpus_path):
    #    spacy_lang = data.load_spacy('en')
    spacy_lang = en_core_web_sm.load()
    #    corpus = textacy.corpus.Corpus(u'en')
    texts = []
    for filename in glob.glob(corpus_path + '/*.txt'):
        print "Keyterms for " + filename
        content = open(filename, 'r').read().decode('utf-8')
        texts.append(content)


#        break # Just for one doc for testing purposes
    text = '\n'.join(content for content in texts)
    return spacy_lang(preprocess_text(text), parse=False)
Example #21
    def show_plot(self, pos, neg, plot_type='semantic'):
        for group in [('Positive', pos), ('Negative', neg)]:
            name, group = group
            if plot_type == 'semantic':
                # if name == 'Positive' and group.shape[0] > 150:
                #     group = group.sample(155)
                corpus = [self.prep_tokens(tweet) for tweet in group]
                corpus = ' '.join(word for word in corpus)
                cleaned_text = textacy.preprocess_text(corpus,
                                                       fix_unicode=True,
                                                       no_accents=True)
                doc = textacy.Doc(cleaned_text, lang='en')
                graph = doc.to_semantic_network(nodes='words',
                                                edge_weighting='cooc_freq',
                                                window_width=10)
                drop_nodes = ['pron']
                for node in drop_nodes:
                    try:
                        graph.remove_node(node)
                    except:
                        pass
                node_weights = nx.pagerank_scipy(graph)
                ax = textacy.viz.network.draw_semantic_network(
                    graph, node_weights=node_weights, spread=50.0)
                plt.suptitle(name + ' Sentiment Topics:' +
                             '\n{} {} tweets\n{}'.format(
                                 group.shape[0], name, self.hashtag))
                # plt.savefig('../images/plots/' + name)
            else:
                corpus = [self.prep_tokens(tweet) for tweet in group]
                tf = TfidfVectorizer().fit(corpus)

                doc_term_matrix = tf.transform(corpus)
                vocab = tf.get_feature_names()
                vocab = [word for word in vocab if word != 'pron']

                model = textacy.tm.TopicModel('nmf', n_topics=3)
                model.fit(doc_term_matrix)
                model.termite_plot(doc_term_matrix,
                                   vocab,
                                   topics=-1,
                                   n_terms=25,
                                   sort_terms_by='seriation',
                                   rank_terms_by='topic_weight',
                                   highlight_topics=range(3))
                plt.suptitle(name + ' Sentiment Topics:')
                # plt.savefig('semantic_plot')

        plt.show(block=False)
Example #22
File: utils.py Project: abiraja2004/DuQI
def normalize_text(text):
    text = textacy.preprocess.normalize_whitespace(
        textacy.preprocess.transliterate_unicode(str(text)))
    return textacy.preprocess_text(text,
                                   fix_unicode=False,
                                   lowercase=False,
                                   transliterate=False,
                                   no_urls=True,
                                   no_emails=True,
                                   no_phone_numbers=True,
                                   no_numbers=False,
                                   no_currency_symbols=True,
                                   no_punct=False,
                                   no_contractions=True,
                                   no_accents=True)
Example #23
def hard_preprocess(df):
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(ftfy.fix_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(clean_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda x: x.replace('"', '').replace("\n", " ").replace("\\", ""))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda text: textacy.preprocess_text(
            text, no_currency_symbols=True, no_urls=True, no_emails=True,
            no_phone_numbers=True, no_numbers=True))
    df.iloc[:, 1] = df.iloc[:, 1].apply(lambda x: x.replace("`", "'")
                                        .replace("& amp ;", " and ")
                                        .replace("@ USER", "[USER]")
                                        .replace("@ URL", "[URL]")
                                        .replace("@ HASHTAG", "[HASHTAG]")
                                        .replace("*NUMBER*", "[NUMBER]"))
    df.iloc[:, 1] = df.iloc[:, 1].apply(fix_contractions)
    return df
Example #24
    def setUp(self):
        spacy_lang = data.load_spacy('en_core_web_sm')
        text = """
        Friedman joined the London bureau of United Press International after completing his master's degree. He was dispatched a year later to Beirut, where he lived from June 1979 to May 1981 while covering the Lebanon Civil War. He was hired by The New York Times as a reporter in 1981 and re-dispatched to Beirut at the start of the 1982 Israeli invasion of Lebanon. His coverage of the war, particularly the Sabra and Shatila massacre, won him the Pulitzer Prize for International Reporting (shared with Loren Jenkins of The Washington Post). Alongside David K. Shipler he also won the George Polk Award for foreign reporting.

        In June 1984, Friedman was transferred to Jerusalem, where he served as the New York Times Jerusalem Bureau Chief until February 1988. That year he received a second Pulitzer Prize for International Reporting, which cited his coverage of the First Palestinian Intifada. He wrote a book, From Beirut to Jerusalem, describing his experiences in the Middle East, which won the 1989 U.S. National Book Award for Nonfiction.

        Friedman covered Secretary of State James Baker during the administration of President George H. W. Bush. Following the election of Bill Clinton in 1992, Friedman became the White House correspondent for the New York Times. In 1994, he began to write more about foreign policy and economics, and moved to the op-ed page of The New York Times the following year as a foreign affairs columnist. In 2002, Friedman won the Pulitzer Prize for Commentary for his "clarity of vision, based on extensive reporting, in commenting on the worldwide impact of the terrorist threat."

        In February 2002, Friedman met Saudi Crown Prince Abdullah and encouraged him to make a comprehensive attempt to end the Arab-Israeli conflict by normalizing Arab relations with Israel in exchange for the return of refugees alongside an end to the Israel territorial occupations. Abdullah proposed the Arab Peace Initiative at the Beirut Summit that March, which Friedman has since strongly supported.

        Friedman received the 2004 Overseas Press Club Award for lifetime achievement and was named to the Order of the British Empire by Queen Elizabeth II.

        In May 2011, The New York Times reported that President Barack Obama "has sounded out" Friedman concerning Middle East issues.
        """
        self.spacy_doc = spacy_lang(preprocess_text(text), parse=False)
Example #25
    def process_content(self):
        if self._content is not None:
            ct = 0
            for document in self._content.documents:
                metadata = {}
                try:
                    metadata['title'] = self._content.titles[ct]
                except IndexError:
                    metadata['title'] = 'Empty'
                # self._corpus.add_text(textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True),
                #                       metadata=metadata)
                doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True)
                doc = textacy.make_spacy_doc((doc_text, metadata), lang=self._en)
                self._corpus.add_doc(doc)
                ct += 1

            self.load_matrix()
Example #26
def build_document(df):
    '''
    Rebuilding a representation of a document from the
    sentences_nlp352 format.

    Parameters
    ----------
    df : pandas dataframe
    '''
    # Find unique documents
    document_ids = pd.Series(df.docid.unique())

    for docid in document_ids[:1]:
        doc = df[df.docid == docid]['word']
        cleaned_doc = textacy.preprocess_text(' '.join(doc.str.cat().replace(
            '{', '').replace('}', '').replace('.', ' . ').split(',')))

        return cleaned_doc.replace('" "', '')
Example #27
def read_cv(cv_id):
    import json
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    filename = 'cv_{0}.json'.format(cv_id)
    cv_path = CV_PATH + filename
    content = ''
    with open(cv_path) as cv_file:
        content = json.load(cv_file)
    corpus_text = content['description']
    #    corpus.add_text(preprocess_text(corpus_text, no_urls=True, no_emails=True,
    #                                    no_phone_numbers=True, no_numbers=True, no_currency_symbols=True,
    #                                    no_punct=True, no_contractions=True, no_accents=True))
    corpus.add_text(
        preprocess_text(corpus_text,
                        no_punct=True,
                        no_contractions=True,
                        no_accents=True))
    return corpus
Example #28
def read_cv2(cv_path):
    import json
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    content = ''
    with open(cv_path) as cv_file:
        content = json.load(cv_file)


#    corpus_text = content['description']
    corpus_text = content.get('description', u'')
    #    corpus.add_text(preprocess_text(corpus_text, no_urls=True, no_emails=True,
    #                                    no_phone_numbers=True, no_numbers=True, no_currency_symbols=True,
    #                                    no_punct=True, no_contractions=True, no_accents=True))
    corpus.add_text(
        preprocess_text(corpus_text,
                        no_punct=True,
                        no_contractions=True,
                        no_accents=True))
    return corpus
Example #29
    def show_plot(self, pos, neg, plot_type='semantic'):
        for group in [('Positive', pos), ('Negative', neg)]:
            name, group = group
            if plot_type == 'semantic':
                corpus = [self.prep_tokens(tweet) for tweet in group]
                corpus = ' '.join(word for word in corpus)
                cleaned_text = textacy.preprocess_text(corpus, fix_unicode=True,
                                                       no_accents=True)
                doc = textacy.Doc(cleaned_text, lang='en')
                graph = doc.to_semantic_network(nodes='words', edge_weighting='cooc_freq', 
                                                window_width=10)
                # drop_nodes = [textacy.spacy_utils.normalized_str(tok)
                #               for tok in doc.tokens]
                # import pdb; pdb.set_trace()
                drop_nodes = ['pron']
                for node in drop_nodes:
                    try:
                        graph.remove_node(node)
                    except:
                        pass
                node_weights = nx.pagerank_scipy(graph)
                ax = textacy.viz.network.draw_semantic_network(graph, 
                                node_weights=node_weights, spread=75.0)
                plt.suptitle(name + ' Sentiment Topics:')
            else:
                corpus = [self.prep_tokens(tweet) for tweet in group]
                tf = TfidfVectorizer().fit(corpus)
            
                doc_term_matrix = tf.transform(corpus)
                vocab = tf.get_feature_names()
                vocab = [word for word in vocab if word != 'pron']
            
                model = textacy.tm.TopicModel('nmf', n_topics=3)
                model.fit(doc_term_matrix)            
                model.termite_plot(doc_term_matrix, vocab, topics=-1,
                                   n_terms=25, sort_terms_by='seriation',
                                   rank_terms_by='topic_weight',
                                   highlight_topics=range(3))
                plt.suptitle(name + ' Sentiment Topics:')

        plt.show(block=False)
Example #30
def read_corpus_in_doc(corpus_path):
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    texts = []
    for filename in glob.glob(corpus_path + '/*.txt'):
        content = open(filename,
                       'r').read().decode('utf-8')  # testing preprocess
        clean_text = preprocess_text(content,
                                     no_punct=True,
                                     no_contractions=True,
                                     no_accents=True)
        texts.append(clean_text)


#        texts.append(open(filename, 'r').read().decode('utf-8'))
#        break
    corpus_text = '\n'.join(text for text in texts)
    corpus.add_text(corpus_text)
    #        corpus.add_text(spacy_lang(preprocess_text(content), parse=False))
    #        break # Just for one doc for testing purposes
    return corpus
Example #31
def dependecy_parsing():
    """Dependecy Parsing
    The parser also powers the sentence boundary detection, and lets you iterate over
    base noun phrases, or "chunks". You can check whether a Doc  object has been parsed
    with the doc.is_parsed attribute, which returns a boolean value. If this attribute
    is False, the default sentence iterator will raise an exception.
    """
    data = request.get_json()
    result = data["data"]
    result = textacy.preprocess_text(result, fix_unicode=True, no_accents=True)
    result = textacy.preprocess.fix_bad_unicode(result, normalization="NFC")
    doc = ner_model(result)

    dependency = [({
        "text": token.text,
        "dependency": token.dep_,
        "tokenHead": token.head.text,
        "tokenHeadPartOfSpeach": token.head.pos_,
        "children": [str(child) for child in token.children],
    }) for token in doc]
    result = jsonify({"dependency": dependency})
    return result
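A plain-spaCy sketch of the same dependency traversal, without the request handling and preprocessing context above. It assumes en_core_web_sm is installed; the sentence is made up.

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
          [child.text for child in token.children])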
Example #32
def iter_id_with_text(filename, chunk_size=1000):
    log.info('Loading spaCy model...')
    nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
    log.info('SpaCy model was loaded...')
    with open(filename) as file:
        for index, article in enumerate(map(json.loads, file), 1):
            abstract = article.get('abstract', '')
            title = article.get('title', '')
            text = textacy.preprocess_text(title + '. ' + abstract,
                                           lowercase=True,
                                           transliterate=True,
                                           no_punct=True,
                                           no_numbers=True)
            terms_list = list(
                textacy.Doc(text, lang=nlp).to_terms_list(as_strings=True,
                                                          named_entities=False,
                                                          normalize='lemma',
                                                          ngrams=(1)))
            if index % chunk_size == 0:
                log.info('%d articles were loaded...', index)
            id = article['id']
            yield id, [term for term in terms_list if term not in STOPWORDS]
Example #33
    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        print('run custom task')
        n_num = self.get_integer('n', default=2)
        filter_stops = self.get_boolean('filter_stops', default=True)
        filter_punct = self.get_boolean('filter_punct', default=True)
        filter_nums = self.get_boolean('filter_nums', default=False)
        lemmas = self.get_boolean('lemmas', default=True)
        limit_to_termset = self.get_boolean('limit_to_termset', default=False)
        termset = self.pipeline_config.terms
        if not termset:
            termset = list()
        lower_termset = [x.lower() for x in termset]

        for doc in self.docs:
            ngrams = list()
            cln_txt = self.get_document_text(doc, clean=True)
            t_doc = Doc(preprocess_text(cln_txt, lowercase=True))
            res = extract.ngrams(t_doc, n_num, filter_stops=filter_stops, filter_punct=filter_punct,
                                 filter_nums=filter_nums)
            for r in res:
                if lemmas:
                    text = r.lemma_
                else:
                    text = r.text

                if limit_to_termset:
                    for t in lower_termset:
                        if text == t or t in text:
                            ngrams.append({
                                'text': text,
                                'count': 1
                            })
                else:
                    ngrams.append({
                        'text': text,
                        'count': 1
                    })
            self.write_multiple_result_data(temp_file, mongo_client, doc, ngrams)
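A minimal sketch of the extract.ngrams call used above, run directly on a spaCy doc (textacy 0.8+ API assumed, en_core_web_sm installed; the text is made up).

import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")
doc = nlp("natural language processing makes text mining much easier")
bigrams = extract.ngrams(doc, 2, filter_stops=True, filter_punct=True, filter_nums=False)
print([ng.lemma_ for ng in bigrams])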