def __init__(self, lang="en"):
    """Initialise for ``lang`` and register the ``arguments`` Doc extension.

    Parameters
    ----------
    lang : str
        Language code forwarded to ``package_check``, ``load_macros`` and
        ``load_patterns``. Defaults to ``"en"``.

    Raises
    ------
    AssertionError
        If an ``arguments`` extension is already registered on ``Doc`` and
        its getter is not an ``ArgumentTexts`` instance.
    """
    super().__init__()
    self.package_check(lang)
    self.load_macros(lang)
    self.load_patterns(lang)

    if not Doc.has_extension('arguments'):
        Doc.set_extension('arguments', getter=ArgumentTexts(self))
    else:
        # An extension with this name already exists; it is left in place
        # (this instance's getter is NOT installed). get_extension returns
        # a (default, method, getter, setter) tuple, of which only the
        # getter is relevant for the sanity check below.
        _, _, getter, _ = Doc.get_extension('arguments')
        # Report the type of the offending getter itself; formatting the
        # type of the whole extension tuple would always print "tuple".
        assert isinstance(getter, ArgumentTexts), \
            "Expected 'arguments' extension to be of type ArgumentTexts " \
            "but found {}. Namespace clash?".format(type(getter))
def load(path):
    """Load the data, spaCy model, and documents.

    Parameters
    ----------
    path : str
        Path to Steam review data CSV.

    Returns
    -------
    pd.DataFrame
        Review data, with an added boolean ``up_funny`` column.
    spacy.lang.en.English
        Language object with extra pipeline components.
    np.ndarray
        One-dimensional object array with one ``spacy.tokens.Doc`` per
        entry of the ``user_review`` column.
    """
    steam_rev = pd.read_csv(path, low_memory=False)
    # Return value is not used; presumably normalizes the frame in place
    # -- TODO confirm against normalize_words.
    normalize_words(steam_rev)

    # Steam reviews can be voted funny or given a thumbs up.
    # So...let's use these as our classes!
    steam_rev["up_funny"] = steam_rev.votes_up > steam_rev.votes_funny

    # Start with a pretrained model on blogs (maybe this is a bad idea?)
    nlp = spacy.load("en_core_web_md")

    # Store the title of each game and whether the review is positive/neg.
    # Extensions live on the Doc class itself, so only register them when
    # they are not already present.
    if not Doc.has_extension("is_recommended"):
        Doc.set_extension("is_recommended", default=np.nan)
    if not Doc.has_extension("title"):
        Doc.set_extension("title", default=np.nan)

    # Add extra pipeline components
    add_titles_ent_pipe(steam_rev, nlp)
    # NOTE: registering a separate "add_title" component via
    # nlp.add_pipe("add_title", after="entity_ruler", config={...}) was
    # attempted and did not work, so it is not added here.

    # Reviews are short so I'm using a large batch size.
    docs = list(nlp.pipe(steam_rev.user_review,
                         cleanup=True,
                         batch_size=256,
                         n_process=-1))

    # Fill a pre-allocated object array slot by slot. Handing the list to
    # np.array() lets NumPy treat each Doc (which supports len() and
    # indexing) as a nested sequence, which can produce a 2-D array of
    # tokens when every Doc happens to have the same length.
    doc_array = np.empty(len(docs), dtype=object)
    for i, doc in enumerate(docs):
        doc_array[i] = doc

    return steam_rev, nlp, doc_array