Example #1
def wordTuples(graph, textEntry):
    # rootify (defined elsewhere) resolves the graph entry to plain text.
    text = rootify(graph, textEntry)
    pt = textacy.load_spacy('pt')
    doc = textacy.Doc(text, lang=pt)
    ts = textacy.TextStats(doc)
    # Rank every unique word with TextRank; one {term: score} dict per term.
    words = [{w[0]: w[1]}
             for w in textacy.keyterms.textrank(
                 doc, normalize='lower', n_keyterms=ts.n_unique_words)]

    return words
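A minimal sketch of the textacy 0.6-era API this function leans on, assuming a Portuguese spaCy model is installed and linked under the 'pt' shortcut (rootify and graph are external to the snippet and bypassed here):

import textacy

pt = textacy.load_spacy('pt')
doc = textacy.Doc("O rato roeu a roupa do rei de Roma.", lang=pt)
# textrank yields (term, score) tuples, which wordTuples wraps in dicts
for term, score in textacy.keyterms.textrank(doc, normalize='lower', n_keyterms=5):
    print(term, score)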
Example #2
def __init__(self):
    self._min_occurrence_for_topic = 2
    self._common_verbs = 10
    # create an empty corpus, with the parser disabled (not needed here)
    en = textacy.load_spacy('en_core_web_sm', disable=('parser',))
    self._corpus = textacy.Corpus(lang=en)
    self._content = None
    self._model = None
    self._numdocs = 0
    self._numtopics = 0
    self._terms = None
    self._doc_term_matrix = None
    self._doc_topic_matrix = None
    # Vectorizer is textacy.vsm.Vectorizer
    self._vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth',
                                  norm='l2', min_df=3, max_df=0.95, max_n_terms=100000)
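For context, a minimal sketch of how a Vectorizer configured like this is typically applied; the toy documents below are illustrative, not from the example:

from textacy.vsm import Vectorizer

tokenized_docs = [
    ["topic", "model", "corpus"],
    ["topic", "model", "verbs"],
    ["topic", "model", "corpus"],
    ["document", "terms"],
]
vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth',
                        norm='l2', min_df=3, max_df=0.95)
# Sparse doc-term matrix; only terms in >= 3 docs and <= 95% of docs survive
doc_term_matrix = vectorizer.fit_transform(tokenized_docs)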
Example #3

def setup_nlp_language_model(language, **nlp_args):

    # Drop an empty 'disable' list so spaCy loads the full pipeline
    # (pop with a default avoids a KeyError when the key is absent).
    if len(nlp_args.get('disable', [])) == 0:
        nlp_args.pop('disable', None)

    def remove_whitespace_entities(doc):
        doc.ents = [e for e in doc.ents if not e.text.isspace()]
        return doc

    logger.info('Loading model: %s...', language)

    # Register the doc-cleaning step as a spaCy v2 pipeline factory
    # (assumes `from spacy.language import Language` at module level).
    Language.factories['remove_whitespace_entities'] = (
        lambda nlp, **cfg: remove_whitespace_entities)
    model_name = LANGUAGE_MODEL_MAP[language]
    #if not model_name.endswith('lg'):
    #    logger.warning('Selected model is not the largest available.')
    nlp = textacy.load_spacy(model_name, **nlp_args)
    nlp.tokenizer = keep_hyphen_tokenizer(nlp)

    pipeline_names = [name for name, _ in nlp.pipeline]

    logger.info('Using pipeline: %s', ' '.join(pipeline_names))

    return nlp
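A hedged usage sketch: assuming LANGUAGE_MODEL_MAP maps language codes to installed model names (e.g. {'en': 'en_core_web_sm'}) and keep_hyphen_tokenizer is defined alongside this function, a caller would do something like:

nlp = setup_nlp_language_model('en', disable=['ner'])
doc = nlp('A quick smoke test for the loaded pipeline.')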
Example #4
# coding: utf-8


from cassandra.cluster import Cluster
import spacy
import pandas as pd
import operator
from datetime import datetime
import textacy
nlp = spacy.load('en_core_web_lg')
# Connect to a local Cassandra cluster and the 'sharelock' keyspace.
cluster = Cluster(['127.0.0.1'])
session = cluster.connect('sharelock')
# The same model is loaded again via textacy for its Doc/Corpus helpers.
en = textacy.load_spacy('en_core_web_lg')

# Clean text rows, lemmatize, remove stop words.

from emoji import UNICODE_EMOJI


def is_emoji(s):
    # True only if `s` contains exactly one emoji occurrence in total.
    count = 0
    for emoji in UNICODE_EMOJI:
        count += s.count(emoji)
        if count > 1:
            return False
    return bool(count)
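The intent of is_emoji is easy to misread, so here is its expected behavior, assuming emoji < 1.0 where UNICODE_EMOJI is a flat dict keyed by emoji strings:

print(is_emoji("🙂"))    # True: exactly one emoji occurrence
print(is_emoji("🙂🙂"))   # False: more than one occurrence
print(is_emoji("hi"))    # False: no emoji at all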
Example #5

# pip install textblob
# python -m textblob.download_corpora
# pip install spacy textacy
# python -m spacy download en

import json
from collections import defaultdict, Counter
import re
import textacy
from textblob import TextBlob

en = textacy.load_spacy("en_core_web_sm")
# my_stop_words = {"the", "to"}
my_stop_words = en.Defaults.stop_words - {"not"}

data = []

with open("only_cellphones_brands.json") as json_data:
    data = json.load(json_data)


# re-implemented from https://github.com/chartbeat-labs/textacy/blob/26abafae7d745614b68d2f90310741b5a8de24d7/textacy/extract.py#L327
# to ignore stop words in matching
def pos_regex_matches(doc, pattern):
    # standardize and transform the regular expression pattern...
    pattern = re.sub(r"\s", "", pattern)
    pattern = re.sub(r"<([A-Z]+)\|([A-Z]+)>", r"( (\1|\2))", pattern)
    pattern = re.sub(r"<([A-Z]+)>", r"( \1)", pattern)

    filtered_tokens = [
        tok for tok in doc if tok.text not in my_stop_words and tok.is_alpha
    ]
    # The rest mirrors the referenced textacy implementation, applied to the
    # filtered token list instead of the full doc (matches are lists of tokens).
    tags = ' ' + ' '.join(tok.pos_ for tok in filtered_tokens)
    for m in re.finditer(pattern, tags):
        yield filtered_tokens[tags[0:m.start()].count(' '):
                              tags[0:m.end()].count(' ')]
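An illustrative call, assuming the completion above: extract adjective-noun chunks from a review sentence while stop words (everything except "not") are skipped during matching:

doc = en("This phone is not a very good phone")
for match in pos_regex_matches(doc, r'<ADJ>*<NOUN>'):
    print([tok.text for tok in match])  # e.g. ['good', 'phone']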
Example #6
import pickle as pkl
import re

import numpy as np
import pandas as pd

from gensim.models.phrases import Phrases, Phraser

import textacy
from textacy import preprocess_text, Doc, Corpus
from textacy.vsm import Vectorizer, GroupVectorizer
from textacy.tm import TopicModel
en = textacy.load_spacy("en_core_web_sm", disable=('parser',))

data = pd.read_csv('data/qaData.csv', parse_dates=['Date'])
ec_data = data.loc[data['EventType'] == "Earnings call", [
    'Date', 'Company', 'Participants', 'AnalystName', 'AnalystCompany',
    'EventName', 'EarningTag2', "Question"
]].copy()
# Quarter label: the portion of the event name before the first "Q".
ec_data['Quarter'] = ec_data['EventName'].str.split("Q").str[0]
# reset_index() inside apply() preserves each question's original row order
# as a column, which is renamed to "QuestionOrder" below.
ec_data = ec_data.groupby(
    ['Date', "Company", "Participants", "EventName",
     "Quarter"]).apply(lambda x: x.reset_index()).reset_index(drop=True)
ec_data.columns = [
    "QuestionOrder", "Date", "Company", "Participants", "AnalystName",
    "AnalystCompany", "EventName", "Tag", "Question", "Quarter"
]
ec_data = ec_data[[
    "Date", "Quarter", "Company", "Participants", "AnalystCompany",
    "AnalystName", "QuestionOrder", "Tag", "Question"
]]
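Given the Vectorizer/GroupVectorizer and TopicModel imports above, a plausible next step is to vectorize the questions grouped by company; this continuation is illustrative, not part of the original snippet:

tokenized = [
    [tok.lemma_.lower() for tok in en(q) if tok.is_alpha and not tok.is_stop]
    for q in ec_data['Question'].fillna('')
]
gv = GroupVectorizer(tf_type='linear', apply_idf=True)
grp_term_matrix = gv.fit_transform(tokenized, ec_data['Company'])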
Example #7
import spacy
import textacy
from spacy import displacy, tokens
from textacy import extract

# Common global objects used for computing similarity,
# plus module-level constants that the helpers below rely on
nlp = spacy.load('en_core_web_sm')
lang_en = textacy.load_spacy('en_core_web_sm')
possible_docs = {'textacy', 'spacy'}
subjects = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
objects = ["dobj", "dative", "attr", "oprd"]
conjunctions = ["or", "and"]


def render_pos_html(list_of_docs):
    # get_spacy_doc is assumed to be defined elsewhere in this module.
    return displacy.render([get_spacy_doc(x) for x in list_of_docs],
                           style='dep',
                           page=True)


def get_new_doc(phrase, doc_type='textacy'):
    assert isinstance(phrase, str)
    assert isinstance(doc_type, str)
    assert doc_type in possible_docs, "Only {} doc types are supported".format(
        possible_docs)

    if doc_type == 'textacy':
        return textacy.Doc(phrase, lang=lang_en)
    elif doc_type == 'spacy':
        return nlp(phrase)
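A quick usage example for get_new_doc; either backend returns an object whose tokens expose spaCy attributes:

doc = get_new_doc("The analyst asked about margins", doc_type='spacy')
print([(tok.text, tok.dep_) for tok in doc])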