def simplify_compound_subjects(sentence_str):
    """Given a sentence string, return a new sentence doc with compound
    subjects reduced to their simplest forms.

    'The man, the boy, and the girl went to school.'

    would reduce to 'They went to school.'

    'The man, the boy, or the girls are frauds.'

    would reduce to 'The girls are frauds.'

    Sentences without a compound subject will not be changed at all."""

    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')

    cs_patterns = \
            [r'((<DET>?(<NOUN|PROPN>|<PRON>)+<PUNCT>)+<DET>?(<NOUN|PROPN>|<PRON>)+<PUNCT>?<CCONJ><DET>?(<NOUN|PROPN>|<PRON>)+)|'\
            '(<DET>?(<NOUN|PROPN>|<PRON>)<CCONJ><DET>?(<NOUN|PROPN>|<PRON>))']

    for cs_pattern in cs_patterns:

        compound_subjects = textacy.extract.pos_regex_matches(
            sentence_doc, cs_pattern)

        # [(start_repl, end_repl, replacement), ...]
        chars_to_repl = []

        for cs in compound_subjects:
            for w in cs:
                if w.pos_ == 'CCONJ' and w.text.lower() == 'and':
                    # replace with they
                    repl = 'they'.ljust(len(cs.text),
                                        '文')  # pad w unexpected char
                    chars_to_repl.append(
                        [cs[0].idx, cs[-1].idx + len(cs[-1].text), repl])

                elif w.pos_ == 'CCONJ' and w.text.lower() != 'and':
                    # replace with final <DET>?(<NOUN|PROPN>|<PRON>)
                    repl = cs[-1:].text
                    if cs[-2].pos_ == 'DET':
                        repl = cs[-2:].text
                    repl = repl.ljust(len(cs.text),
                                      '文')  # pad w unexpected char

                    chars_to_repl.append(
                        [cs[0].idx, cs[-1].idx + len(cs[-1].text), repl])

        new_sent_str = sentence_doc.text
        for replacement in chars_to_repl:
            new_sent_str = new_sent_str[:replacement[0]] + replacement[2] + \
                    new_sent_str[replacement[1]:]
        new_sent_str = new_sent_str.replace('文', '')
        new_sent_str = re.sub(r'\s+', ' ', new_sent_str).strip()

    sentence_doc = textacy.Doc(new_sent_str, lang='en_core_web_lg')
    return sentence_doc
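# Usage sketch, assuming an older textacy release that still provides
# textacy.Doc and pos_regex_matches, plus the en_core_web_lg model; the exact
# output depends on spaCy's parse.
simplified = simplify_compound_subjects(
    'The man, the boy, and the girl went to school.')
print(simplified.text)  # roughly: 'they went to school.' (case is not restored)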
Example #2
def headline_to_svo(text):
    text_str = ''.join(filter(lambda x: x in string.printable, text))
    text_lower = text_str.lower()
    d1 = textacy.Doc(text_lower, lang=u"en")
    text_lower_str = str(text_lower)
    vs = textacy.extract.get_main_verbs_of_sent(d1)
    # lowercase each main verb within the original-cased text before re-parsing
    for v in vs:
        v_str = str(v)
        idx = text_lower_str.index(v_str)
        text_str = text_str[:idx] + v_str + text_str[idx + len(v):]
    d = textacy.Doc(text_str, lang=u"en")
    svo = textacy.extract.subject_verb_object_triples(d)
    return next(svo, None)
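# Usage sketch, assuming the snippet's imports (string, textacy) and the spaCy
# English model are available; yields a (subject, verb, object) span triple,
# or None when no triple is found.
triple = headline_to_svo('Apple buys a small startup')
print(triple)  # e.g. (Apple, buys, startup)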
def kwd(sentence):

    try:
        txt_doc = textacy.Doc(sentence, lang="en_core_web_sm")
        kwds_sgrank = textacy.keyterms.sgrank(txt_doc, ngrams=(1,2), n_keyterms=3)
        kwds_sgrank = [kwd[0] for kwd in kwds_sgrank]
        kwds_sgrank_str = ', '.join(kwds_sgrank)
        kwds_textrank = textacy.keyterms.textrank(txt_doc)
        kwds_txtrnk = [kwd[0] for kwd in kwds_textrank]
        kw_uni = []
        for kw in kwds_txtrnk:

            if kw not in kwds_sgrank_str:
                kw_uni.append(kw)

        kwds = kwds_sgrank + kw_uni
        l_kwds = [kwd for kwd in kwds]
        # remove numbers from keywords
        l_kwds = [kwd for kwd in l_kwds if not any(str(v).isdigit() for v in kwd)]
        # removing very small keywords (to reduce noise)
        l_kwds = [kwd for kwd in l_kwds if len(kwd)>3]
        kwds = l_kwds

    except Exception:
        kwds = []

    return kwds
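# Usage sketch, assuming an older textacy release that still ships
# textacy.keyterms and the en_core_web_sm model; output varies with the model.
print(kwd('Machine learning models extract keywords from short text snippets.'))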
Example #4
def build_comp_termlist():
    import os
    spacy_lang = en_core_web_sm.load()
    texts = []
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        for comp_path in glob.glob(kag_path + '/*'):
            _, comp_name = os.path.split(comp_path)
            #            print ('=Handling competence ' + comp_name)
            comp_text = ''
            for filename in glob.glob(comp_path + '/*.txt'):
                clean_text = preprocess_text(open(filename,
                                                  'r').read().decode('utf-8'),
                                             no_urls=True,
                                             no_emails=True,
                                             no_phone_numbers=True,
                                             no_numbers=True,
                                             no_currency_symbols=True,
                                             no_punct=True,
                                             no_contractions=True,
                                             no_accents=True)
                comp_text = ' '.join(text for text in (comp_text, clean_text))
                # comp_text += open(filename, 'r').read().decode('utf-8')
            doc = textacy.Doc(comp_text, lang=spacy_lang)
            texts.append(
                doc.to_terms_list(named_entities=False, as_strings=True))
    return texts
Example #5
    def extractNamedEntitiesAndCreateTextList(self, wordsWithoutStopWordsList):
        """
        Extract named entities from a given text and create a 
        list of all words in the article's text. In this case, 
        the named entites will be stored as one item, such as [...,'Angela Merkel, 'said',...]
        :text: String
        :returns: List
        """

        # transform text list to string, since removeStopWordsFromText() returns a list
        text = " ".join(w for w in wordsWithoutStopWordsList)

        # TODO: handle all used languages (en, es, de)
        doc = textacy.Doc(text, lang='en')
        entities = list(textacy.extract.named_entities(doc, exclude_types='numeric'))

        named_entities = [str(ent) for ent in entities]

        # replace named entities in text with a 'tmpN' placeholder string
        for i, en in enumerate(named_entities):
            # Replace only the first occurrence of each named entity with its tmpN
            # value. This matters because there could otherwise be several identical
            # placeholders (e.g. several tmp1), and the system would not be able to
            # swap the values back in the next step.
            text = text.replace(en, 'tmp{}'.format(i), 1)

        # text string to list
        text = text.split()

        # swap each 'tmpN' placeholder in the text list back to its named entity
        for j, ent in enumerate(named_entities):
            text[text.index('tmp{}'.format(j))] = ent

        # return the new text as a list that contains all words and named entities
        return text
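# Usage sketch; `preprocessor` stands for a hypothetical instance of the
# surrounding class. Multi-word entities come back as single list items,
# subject to the NER model's output, e.g.:
#   preprocessor.extractNamedEntitiesAndCreateTextList(
#       ['Angela', 'Merkel', 'said', 'Berlin', 'remains', 'committed'])
#   -> ['Angela Merkel', 'said', 'Berlin', 'remains', 'committed']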
Example #6
def similaritytoselection(selection, corpus):
    # selection is any selected text string, e.g. a model answer or a response
    # picked from the student responses.
    # Returns a list of (similarity, text) tuples ordered from most to least
    # similar to the selected text, on a scale from 1 to 0.
    s = textacy.Doc(selection)
    simlist = [(textacy.similarity.word2vec(s, doc), doc.text) for doc in corpus]
    simlist.sort(key=lambda v: -v[0])
    return simlist
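# Usage sketch with hypothetical data, mirroring the textacy.Doc call above;
# word2vec similarity is only meaningful with a model that has word vectors.
responses = [textacy.Doc(t) for t in ('Cells divide by mitosis.',
                                      'Plants use sunlight to make food.')]
ranked = similaritytoselection('Photosynthesis converts light to energy.',
                               responses)
print(ranked[0])  # (highest similarity score, text of the most similar doc)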
Example #7
def hola(request):
    datos = []
    nlp = spacy.load("es_core_news_md")

    if request.method == "POST":
        text_box_value = request.POST['text_box']
        print(text_box_value)

    contenido = text_box_value
    contenido = nlp(contenido)
    docs = textacy.Doc(contenido)
    sentencias = [s for s in docs.sents]
    print(len(sentencias))
    tipos = set(ent.label_ for ent in contenido.ents)

    def cleanup(token, lower=True):
        if lower:
            token = token.lower()
        return token.strip()

    labels = set([w.label_ for w in contenido.ents])
    personas = ""
    for label in labels:
        entities = [
            cleanup(e.string, lower=False) for e in contenido.ents
            if label == e.label_
        ]
        entities = list(set(entities))
        if label == "PER":
            personas = entities
        print(label, entities)

    print("Esta son las entidades personas")

    return render(request, 'hola.html', {'datos': datos})
Example #8
    def find_phrases(self, sentence, stop_tokens):
        doc = nlp(sentence)
        doc_grams = []
        unigrams = []
        for i in doc.noun_chunks:
            text = " ".join(
                [t.lemma_ if t.lemma_ != "-PRON-" else t.text for t in i])
            tokens = [
                t for t in text.split() if t != "" and t not in stop_tokens
            ]
            unigrams.extend(
                list(filter(lambda word: self.is_valid_word(word), tokens)))
            grams = self.generate_ngrams(tokens, 3)
            grams.extend(self.generate_ngrams(tokens, 2))
            for word in grams:
                if word not in stop_tokens:
                    doc_grams.append(space_join(word))

        pattern = r'<VERB>?<ADV>*<VERB>+'
        doc = textacy.Doc(sentence, lang=model)
        lists = textacy.extract.pos_regex_matches(doc, pattern)
        verbs_list = []
        for l in lists:
            verb_tokens = l.lemma_.split()
            for verb in verb_tokens:
                if verb not in stop_tokens and self.is_valid_word(verb):
                    verbs_list.append(verb)
        return doc_grams, unigrams, verbs_list
Example #9
def keywords():
    #print request.get_json()
    arg = request.get_json()
    doc = textacy.Doc(arg['content'],
                      metadata={'title': arg['title']},
                      lang=unicode('en_core_web_sm'))
    sgrank_keywords = dict(keyterms.sgrank(doc))
    singlerank_keywords = dict(keyterms.singlerank(doc))
    textrank_keywords = dict(keyterms.textrank(doc))
    sgrank_keywords.update((x, y * 0.9) for x, y in sgrank_keywords.items())
    textrank_keywords.update(
        (x, y * 0.05) for x, y in textrank_keywords.items())
    singlerank_keywords.update(
        (x, y * 0.05) for x, y in singlerank_keywords.items())
    keywords = dict(
        Counter(sgrank_keywords) + Counter(textrank_keywords) +
        Counter(singlerank_keywords))
    sorted_keywords = sorted(keywords.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    keyword_string = ""

    for i, key in enumerate(sorted_keywords):
        if (i == int(len(sorted_keywords) / 2)):
            keyword_string = keyword_string + "||"
        if (i == len(sorted_keywords) - 1
                or i == int(len(sorted_keywords) / 2) - 1):
            keyword_string = keyword_string + key[0]
        else:
            keyword_string = keyword_string + key[0] + ",,"

    return keyword_string
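# Illustration of the score-merging step above, with hypothetical scores:
# adding Counters sums the weighted score of any keyword that appears in more
# than one ranking, which is how the three keyterm dicts are combined.
from collections import Counter
merged = dict(Counter({'energy': 0.5}) + Counter({'energy': 0.25, 'solar': 0.25}))
print(merged)  # {'energy': 0.75, 'solar': 0.25}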
    def __init__(self, infile):
        efc = ExtractFileContents()
        print("INIT: Extract contents for infile: ", infile)

        input_file_contents = efc.extractContents(infile)
        metadata = {'filename': infile}
        self.doc = textacy.Doc(input_file_contents, metadata=metadata)
Example #11
def label_kwd(sentence):

    try:
        txt_doc = textacy.Doc(sentence, lang="en_core_web_sm")
        kwds_sgrank = textacy.keyterms.sgrank(txt_doc, ngrams=(1, 2))
        kwds_sgrank = [kwd[0] for kwd in kwds_sgrank]
        kwds_sgrank_str = ', '.join(kwds_sgrank)
        kwds_textrank = textacy.keyterms.textrank(txt_doc)
        kwds_txtrnk = [kwd[0] for kwd in kwds_textrank]
        kw_uni = []
        for kw in kwds_txtrnk:

            if kw not in kwds_sgrank_str:
                kw_uni.append(kw)

        kwds = kwds_sgrank + kw_uni
        l_kwds = [kwd for kwd in kwds]
        # remove numbers from keywords
        l_kwds = [
            kwd for kwd in l_kwds if not any(str(v).isdigit() for v in kwd)
        ]
        # removing very small keywords (to reduce noise)
        l_kwds = [kwd for kwd in l_kwds if len(kwd) > 3]
        kwds = ','.join(l_kwds)
        final_data = sentence + '\t' + kwds + '\n'

    except Exception:
        final_data = ''

    print(final_data)
    print()
    return final_data
Example #12
def remove_adverbial_clauses(sentence_str):
    """Given a string, drop any adverbial clauses."""
    # should also return updated indexes
    # Sam , worried, asked him.
    # [0, 0, 0, 0, 3, 3, 3]
    tdoc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    advcl_phrases = []  #=> [(start.i, end.i), ...]
    has_advcl = False
    start = None
    for w in tdoc:
        if w.tag_ == ',' and has_advcl:  # end phrase, start next
            if start:  # end phrase if started
                advcl_phrases.append((start.i, w.i))
            start = w
            has_advcl = False
        elif w.tag_ == ',':  # start phrase
            start = w
            has_advcl = False
        if w.dep_ == 'advcl':
            has_advcl = True

    new_sent_str = sentence_str
    unusual_char = '形'
    for advcl in advcl_phrases:
        start = tdoc[advcl[0]].idx
        end = tdoc[advcl[1]].idx + len(tdoc[advcl[1]].text)
        sub = unusual_char * (end - start)
        new_sent_str = new_sent_str[:start] + sub + new_sent_str[end:]
    new_sent_str = new_sent_str.replace(unusual_char, '')
    return new_sent_str
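# Usage sketch, reusing the example from the comments above; the result depends
# on the parser labelling 'worried' as an adverbial clause (advcl).
print(remove_adverbial_clauses('Sam, worried, asked him.'))
# roughly: 'Sam asked him.'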
Example #13
    def phrases(self, clean_text):
        all_lemmas = lemmatize(clean_text, stopwords=self.stopwords)
        curated_words = [str(word).split('/')[0] for word in all_lemmas]
        curated_text = ' '.join(curated_words)

        doc = textacy.Doc(curated_text, lang='en')

        all_phrases = []
        for n in (2, 3, 4, 5):
            all_phrases += textacy.extract.ngrams(doc,
                                                  n,
                                                  filter_stops=True,
                                                  filter_punct=True,
                                                  filter_nums=True)

        phrases = [str(phrase) for phrase in all_phrases]

        return phrases
def substitute_infinitives_as_subjects(sent_str):
    """If an infinitive is used as a subject, substitute the gerund."""
    sent_doc = textacy.Doc(sent_str, lang='en_core_web_lg')
    #inf_pattern = r'<PART><VERB>+' # To aux/auxpass* csubj
    inf_pattern = r'<PART><VERB>'  # To aux/auxpass* csubj
    infinitives = textacy.extract.pos_regex_matches(sent_doc, inf_pattern)
    inf_subjs = []  # => [[0,1],...]
    for inf in infinitives:
        if inf[0].text.lower() != 'to':
            continue
        if ('csubj' not in [w.dep_ for w in inf]
                and sent_doc[inf[-1].i + 1].dep_ != 'csubj'):
            continue
        if inf[-1].tag_ != 'VB':
            continue
        inf_subj = []
        for v in inf:
            inf_subj.append(v.i)
        inf_subjs.append(inf_subj)
    new_sent_str = sent_str
    unusual_char = '形'
    for inf_subj in inf_subjs:
        start_inf = sent_doc[inf_subj[0]].idx
        end_inf = sent_doc[inf_subj[-1]].idx + len(sent_doc[inf_subj[-1]])
        inf_len = end_inf - start_inf
        sub = (unusual_char * inf_len)
        new_sent_str = new_sent_str[:start_inf] + sub + new_sent_str[end_inf:]
    new_sent_str = re.sub('形+', '{}', new_sent_str)
    repl = [
        conjugate(sent_doc[i_s[-1]].text, tense='presentparticiple')
        for i_s in inf_subjs
    ]
    return new_sent_str.format(*repl)
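# Usage sketch; requires whatever conjugate() helper the snippet relies on to
# be importable, plus the en_core_web_lg model. The infinitive subject is
# rewritten as a gerund.
print(substitute_infinitives_as_subjects('To err is human.'))
# roughly: 'erring is human.' (capitalisation is not restored)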
Example #15
def v_phrase_scores(sentence=None, phrase_type='verb'):

    if (phrase_type == 'verb'):
        pattern = r'<VERB>?<ADV>*<VERB>+'
    elif (phrase_type == 'noun'):
        pattern = r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'
    else:
        print('Unrecognized phrase type')
        return

    doc = textacy.Doc(sentence, lang='en_core_web_sm')
    lists = textacy.extract.pos_regex_matches(doc, pattern)

    tot_phrase_score = {}
    tot_phrase_score['pos'] = 0
    tot_phrase_score['neg'] = 0
    tot_phrase_score['neu'] = 0
    tot_phrase_score['compound'] = 0
    for match in lists:
        phrase_score = v_sentiment_scores(match.text)
        tot_phrase_score['pos'] += phrase_score['pos']
        tot_phrase_score['neg'] += phrase_score['neg']
        tot_phrase_score['neu'] += phrase_score['neu']
        tot_phrase_score['compound'] += phrase_score['compound']
    return tot_phrase_score
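# Usage sketch; v_sentiment_scores is assumed to be a VADER-style scorer defined
# elsewhere that returns 'pos', 'neg', 'neu' and 'compound' values per phrase.
scores = v_phrase_scores('She really loves hiking but hates the cold.', 'verb')
print(scores)  # sentiment scores summed over the matched verb phrases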
Example #16
def word_stats(tree):
    """Returns a bunch of textacy stats on the text"""
    stats = None
    try:
        text = tree.find("mainText").text
        doc = textacy.Doc(text)
        stats = textacy.text_stats.TextStats(doc)
    except Exception as e:
        print(e)
    return stats
Example #17
    def setUp(self):
        self.spacy_lang = textacy.data.load_spacy('en')
        self.cw = textacy.datasets.CapitolWords()
        self.text = list(self.cw.texts(speaker_name={'Bernie Sanders'}, limit=1))[0]
        self.doc = textacy.Doc(self.text.strip(), lang=self.spacy_lang)
        records = self.cw.records(speaker_name={'Bernie Sanders'}, limit=10)
        text_stream, metadata_stream = textacy.fileio.split_record_fields(
            records, 'text')
        self.corpus = textacy.Corpus(
            self.spacy_lang, texts=text_stream, metadatas=metadata_stream)
def raise_infinitive_error(sentence_str):
    """Given a string, check that all infinitives are properly formatted"""
    sent_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    inf_pattern = r'<PART|ADP><VERB>'  # To aux/auxpass* csubj
    infinitives = textacy.extract.pos_regex_matches(sent_doc, inf_pattern)
    for inf in infinitives:
        if inf[0].text.lower() != 'to':
            continue
        if inf[-1].tag_ != 'VB':
            raise Exception('InfinitivePhraseError')
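# Usage sketch; a malformed infinitive such as 'to walking' should trigger the
# exception, subject to the tagger's output.
try:
    raise_infinitive_error('He wants to walking every day.')
except Exception as err:
    print(err)  # InfinitivePhraseError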
Example #19
def get_new_doc(phrase, doc_type='textacy'):
    assert isinstance(phrase, basestring)
    assert isinstance(doc_type, str)
    assert doc_type in possible_docs, "Only {} doc types are supported".format(
        possible_docs)

    if doc_type == 'textacy':
        return textacy.Doc(phrase, lang=lang_en)
    elif doc_type == 'spacy':
        return nlp(phrase)
Example #20
    def wordTuples(graph, textEntry):
        text = rootify(graph, textEntry)
        pt = textacy.load_spacy('pt')
        doc = textacy.Doc(text, lang=pt)
        ts = textacy.TextStats(doc)
        words = [{
            w[0]: w[1]
        } for w in textacy.keyterms.textrank(
            doc, normalize='lower', n_keyterms=ts.n_unique_words)]

        return words
Example #21
def extractNamedEntities(query: str, language: str) -> list:
    """
    Extract Phrases from a given query sentence string
    :query: String
    :language: String
    :returns: List
    """
    doc = textacy.Doc(query, lang=language)
    entities = list(textacy.extract.named_entities(doc))

    return [str(ent) for ent in entities]
def split_infinitive_warning(sentence_str):
    """Return a warning for a split infinitive, else, None"""
    sent_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    inf_pattern = r'<PART><ADV><VERB>'  # To aux/auxpass* csubj
    infinitives = textacy.extract.pos_regex_matches(sent_doc, inf_pattern)
    for inf in infinitives:
        if inf[0].text.lower() != 'to':
            continue
        if inf[-1].tag_ != 'VB':
            continue
        return 'SplitInfinitiveWarning'
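# Usage sketch with the classic split infinitive; returns None when no split
# infinitive is found.
print(split_infinitive_warning(
    'They decided to boldly go where no one has gone before.'))
# roughly: 'SplitInfinitiveWarning'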
Example #23
def get_verb_chunks(sent):
    verb_chunks = []
    pattern = r'<VERB>?<ADV>*<VERB>+'
    doct = textacy.Doc(sent, lang='en_core_web_sm')
    lists = textacy.extract.pos_regex_matches(doct, pattern)
    for match in lists:
        verb_chunks.append(match.text)
        # print(match.text)

    print(verb_chunks)

    return verb_chunks
def find_terms(content: str,
               maxterms=MAX_SUGGESTED_KEYTERMS,
               ngrams_to_extract=(1, 2, 3)) -> dict:
    '''
    Find ngrams of significance in the passed text; by default it looks for
    ngrams of up to 3 words.
    Returns a dict with 'ngrams', a list of (term, frequency) tuples sorted by
    frequency, and 'keyterms', the textrank key terms.
    '''
    import textacy
    import textacy.keyterms
    import io
    from spacy.lang.en.stop_words import STOP_WORDS

    # if we were given standard input, read the io.TextIOWrapper;
    # otherwise accept the value as-is
    if isinstance(content, io.TextIOWrapper):
        text = content.read()
    else:
        # it's assumed to be a str
        text = content

    LOG.debug('content has %d chars', len(text))

    # find tags here
    tags = []

    #lang_en = spacy.util.get_lang_class('en')
    doc = textacy.Doc(text)
    LOG.debug(doc)

    # extract keyterms to make suggestions for new ADASS subject terms
    keyterms = textacy.keyterms.textrank(doc,
                                         normalize='lemma',
                                         n_keyterms=maxterms)
    LOG.debug('KEYTERMS: %s', keyterms)

    # We'll use the Bag of terms, ngrams by frequency, to find relevant matches with
    # existing terms in the ADASS dictionary
    bot = doc.to_bag_of_terms(ngrams=ngrams_to_extract,
                              lemmatize=True,
                              named_entities=True,
                              weighting='count',
                              as_strings=True)

    # For some reason stopwords show up in the bag of terms, so make another pass
    # to drop stopwords and the empty string, then log the top terms by occurrence
    cleaned_bot = [(term, cnt) for term, cnt in bot.items()
                   if term not in STOP_WORDS and term != '']
    sorted_cleaned_bot = sorted(cleaned_bot, key=lambda x: x[1], reverse=True)
    LOG.debug('Bag of terms (top, cleaned): %s',
              sorted_cleaned_bot[:MAX_BOT_TERMS])

    return {'ngrams': sorted_cleaned_bot, 'keyterms': keyterms}
Example #25
    def extract_keyphrases(self, algorithm, **kwargs):
        """ Method for extracting keyphrases from text
            algorithm takes 'str' object -> get function using eval
                           'func' object
            **kwargs: parameters for algorithm
        """

        if isinstance(algorithm, str):
            algorithm = eval('tkt.{}'.format(algorithm))

        doc = textacy.Doc(self.text, lang='en')
        self.keyphrases = list(algorithm(doc, **kwargs))
def drop_modifiers(sentence_str):
    """Given a string, drop the modifiers and return a string 
    without them"""
    tdoc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    new_sent = tdoc.text
    unusual_char = '形'
    for tag in tdoc:
        if tag.dep_.endswith('mod'):
            # Replace the tag
            new_sent = new_sent[:tag.idx] + unusual_char * len(tag.text) +\
                    new_sent[tag.idx + len(tag.text):]
    new_sent = new_sent.replace(unusual_char, '')
    new_sent = textacy.preprocess.normalize_whitespace(new_sent)
    return new_sent
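# Usage sketch; tokens whose dependency label ends in 'mod' (amod, advmod,
# npadvmod, ...) are blanked out, subject to the parse.
print(drop_modifiers('The extremely old dog barked loudly.'))
# roughly: 'The dog barked .'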
def remove_prepositional_phrases(sentence_str):
    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    # possessive pronouns labeled as ADJ
    pp_pattern = r'<ADP><ADJ|DET>?(<NOUN>+<ADP>)*<NOUN>+'
    prep_phrases = textacy.extract.pos_regex_matches(sentence_doc, pp_pattern)
    new_sent_str = sentence_str
    unusual_char = '形'
    for pp in prep_phrases:
        sub = unusual_char * len(pp.text)
        new_sent_str = new_sent_str[:pp[0].
                                    idx] + sub + new_sent_str[pp[0].idx +
                                                              len(pp.text):]
    new_sent_str = new_sent_str.replace(unusual_char, '')
    return new_sent_str
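# Usage sketch; the <ADP>... pattern should capture 'in the park' and blank it
# out of the sentence (whitespace is not re-normalised afterwards).
print(remove_prepositional_phrases('The dog in the park barked.'))
# roughly: 'The dog  barked.'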
    def show_plot(self, pos, neg, plot_type='semantic'):
        for group in [('Positive', pos), ('Negative', neg)]:
            name, group = group
            if plot_type == 'semantic':
                # if name == 'Positive' and group.shape[0] > 150:
                #     group = group.sample(155)
                corpus = [self.prep_tokens(tweet) for tweet in group]
                corpus = ' '.join(word for word in corpus)
                cleaned_text = textacy.preprocess_text(corpus,
                                                       fix_unicode=True,
                                                       no_accents=True)
                doc = textacy.Doc(cleaned_text, lang='en')
                graph = doc.to_semantic_network(nodes='words',
                                                edge_weighting='cooc_freq',
                                                window_width=10)
                drop_nodes = ['pron']
                for node in drop_nodes:
                    try:
                        graph.remove_node(node)
                    except Exception:
                        pass
                node_weights = nx.pagerank_scipy(graph)
                ax = textacy.viz.network.draw_semantic_network(
                    graph, node_weights=node_weights, spread=50.0)
                plt.suptitle(name + ' Sentiment Topics:' +
                             '\n{} {} tweets\n{}'.format(
                                 group.shape[0], name, self.hashtag))
                # plt.savefig('../images/plots/' + name)
            else:
                corpus = [self.prep_tokens(tweet) for tweet in group]
                tf = TfidfVectorizer().fit(corpus)

                doc_term_matrix = tf.transform(corpus)
                vocab = tf.get_feature_names()
                vocab = [word for word in vocab if word != 'pron']

                model = textacy.tm.TopicModel('nmf', n_topics=3)
                model.fit(doc_term_matrix)
                model.termite_plot(doc_term_matrix,
                                   vocab,
                                   topics=-1,
                                   n_terms=25,
                                   sort_terms_by='seriation',
                                   rank_terms_by='topic_weight',
                                   highlight_topics=range(3))
                plt.suptitle(name + ' Sentiment Topics:')
                # plt.savefig('semantic_plot')

        plt.show(block=False)
Example #29
def generate_corpus(comment_json):
    my_corpus = []
    with open(comment_json) as input_file:
        article_comments = json.load(input_file)

    comments = list(article_comments.values())[0]['comments']
    for cmt in comments:
        md = {
            'comment_author': cmt['comment_author'],
            'comment_time': cmt['comment_time']
        }
        doc = textacy.Doc(cmt['comment'], metadata=md)
        my_corpus.append(doc)

    return my_corpus
Example #30
def remove_prepositional_phrases(sentence_str):
    """Given a string, drop the prepositional phrases and return a new string
    without them"""
    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    pp_pattern = r'<ADP><ADJ|DET>?(<NOUN>+<ADP>)*<NOUN>+'
    prep_phrases = textacy.extract.pos_regex_matches(sentence_doc, pp_pattern)
    new_sent_str = sentence_str
    unusual_char = '形'
    for pp in prep_phrases:
        sub = unusual_char * len(pp.text)
        new_sent_str = new_sent_str[:pp[0].
                                    idx] + sub + new_sent_str[pp[0].idx +
                                                              len(pp.text):]
    new_sent_str = new_sent_str.replace(unusual_char, '')
    return new_sent_str