def __init__(self, text_or_sdoc, spacy_pipeline=None, lang=None, metadata=None):
    self.metadata = {} if metadata is None else metadata
    self._term_counts = Counter()
    if isinstance(text_or_sdoc, str):
        self.lang = text_utils.detect_language(text_or_sdoc) if not lang else lang
        if spacy_pipeline is None:
            spacy_pipeline = data.load_spacy(self.lang)
        else:
            # check for match between text and passed spacy_pipeline language
            if spacy_pipeline.lang != self.lang:
                msg = 'TextDoc.lang {} != spacy_pipeline.lang {}'.format(
                    self.lang, spacy_pipeline.lang)
                raise ValueError(msg)
        self.spacy_vocab = spacy_pipeline.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = spacy_pipeline(text_or_sdoc)
    elif isinstance(text_or_sdoc, sdoc):
        self.lang = (spacy_pipeline.lang if spacy_pipeline is not None
                     else text_utils.detect_language(text_or_sdoc.text_with_ws))
        self.spacy_vocab = text_or_sdoc.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = text_or_sdoc
    else:
        msg = 'TextDoc must be initialized with {}, not {}'.format(
            {str, sdoc}, type(text_or_sdoc))
        raise ValueError(msg)
def __init__(self, text_or_sdoc, spacy_pipeline=None, lang=None, metadata=None):
    self.metadata = {} if metadata is None else metadata
    self._term_counts = Counter()
    if isinstance(text_or_sdoc, string_types):
        self.lang = text_utils.detect_language(text_or_sdoc) if not lang else lang
        if spacy_pipeline is None:
            spacy_pipeline = data.load_spacy(self.lang)
        else:
            # check for match between text and passed spacy_pipeline language
            if spacy_pipeline.lang != self.lang:
                msg = 'TextDoc.lang {} != spacy_pipeline.lang {}'.format(
                    self.lang, spacy_pipeline.lang)
                raise ValueError(msg)
        self.spacy_vocab = spacy_pipeline.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = spacy_pipeline(text_or_sdoc)
    elif isinstance(text_or_sdoc, sdoc):
        self.lang = (spacy_pipeline.lang if spacy_pipeline is not None
                     else text_utils.detect_language(text_or_sdoc.text_with_ws))
        self.spacy_vocab = text_or_sdoc.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = text_or_sdoc
    else:
        msg = 'TextDoc must be initialized with {}, not {}'.format(
            {str, sdoc}, type(text_or_sdoc))
        raise ValueError(msg)
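# A minimal usage sketch for the TextDoc constructors above (hypothetical
# example; assumes the surrounding textacy-style module, with `data`,
# `text_utils`, and the `sdoc` alias for spacy.tokens.Doc in scope).
doc = TextDoc('This is a short English sentence.')  # language auto-detected
assert doc.lang == 'en'
doc2 = TextDoc(doc.spacy_doc)  # wrap an already-parsed spacy Doc instead of raw text
# Passing a spacy_pipeline whose language disagrees with `lang` raises ValueError.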
def extract_responses(filepath, writer):
    with open(filepath) as input_file:
        reader = csv.reader(input_file, quoting=csv.QUOTE_MINIMAL)
        deleted = "deleted"
        for line in reader:
            if (deleted not in line[0]) and (deleted not in line[1]):
                preprocessed_line = preprocess(line[1])
                try:
                    if detect_language(preprocessed_line) == 'en':
                        writer.writerow([preprocessed_line])
                except ValueError:
                    continue
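# Hypothetical driver for extract_responses above: stream non-deleted,
# English-language rows from a raw comments CSV into a cleaned output file.
# The file paths are illustrative; assumes `preprocess` and `detect_language`
# are importable as in the rest of this repo.
import csv

with open('data/processed/responses_en.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file, quoting=csv.QUOTE_MINIMAL)
    extract_responses('data/raw/comments.csv', writer)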
def __init__(self, content, metadata=None, lang=None):
    self.metadata = metadata or {}
    # Doc instantiated from text, so must be parsed with a spacy.Language
    if isinstance(content, unicode_type):
        if isinstance(lang, SpacyLang):
            self.lang = lang.lang
            spacy_lang = lang
        elif isinstance(lang, unicode_type):
            self.lang = lang
            spacy_lang = data.load_spacy(self.lang)
        elif lang is None:
            self.lang = text_utils.detect_language(content)
            spacy_lang = data.load_spacy(self.lang)
        else:
            msg = '`lang` must be {}, not "{}"'.format(
                {unicode_type, SpacyLang}, type(lang))
            raise ValueError(msg)
        self.spacy_vocab = spacy_lang.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = spacy_lang(content)
    # Doc instantiated from an already-parsed spacy.Doc
    elif isinstance(content, SpacyDoc):
        self.spacy_vocab = content.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = content
        self.lang = self.spacy_vocab.lang
        # these checks are probably unnecessary, but in case a user
        # has done something very strange, we should complain...
        if isinstance(lang, SpacyLang):
            if self.spacy_vocab is not lang.vocab:
                msg = ('`spacy.Vocab` used to parse `content` must be the same '
                       'as the one associated with `lang`')
                raise ValueError(msg)
        elif isinstance(lang, unicode_type):
            if lang != self.lang:
                raise ValueError(
                    'lang of spacy models used to parse `content` must be the same as `lang`')
        elif lang is not None:
            msg = '`lang` must be {}, not "{}"'.format(
                {unicode_type, SpacyLang}, type(lang))
            raise ValueError(msg)
    # oops, user has made some sort of mistake
    else:
        msg = '`Doc` must be initialized with {}, not "{}"'.format(
            {unicode_type, SpacyDoc}, type(content))
        raise ValueError(msg)
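# Minimal usage sketch for the Doc constructor above (hypothetical example).
# `lang` may be a language-code string, an already-loaded spacy.Language, or
# None, in which case the language is detected from `content`.
doc = Doc('This is a short English sentence.', metadata={'source': 'example'})
assert doc.lang == 'en'
same_doc = Doc(doc.spacy_doc, lang='en')  # re-wrap an already-parsed spacy.Doc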
def __init__(self, text, spacy_pipeline=None, lang='auto', metadata=None,
             max_cachesize=5):
    self.metadata = {} if metadata is None else metadata
    self.lang = text_utils.detect_language(text) if lang == 'auto' else lang
    if spacy_pipeline is None:
        self.spacy_pipeline = data.load_spacy_pipeline(lang=self.lang)
    else:
        # check for match between text and supplied spacy pipeline language
        if spacy_pipeline.lang != self.lang:
            msg = 'TextDoc.lang {} != spacy_pipeline.lang {}'.format(
                self.lang, spacy_pipeline.lang)
            raise ValueError(msg)
        else:
            self.spacy_pipeline = spacy_pipeline
    self.spacy_vocab = self.spacy_pipeline.vocab
    self.spacy_stringstore = self.spacy_vocab.strings
    self.spacy_doc = self.spacy_pipeline(text)
    self._term_counts = Counter()
    self._cache = LRUCache(maxsize=max_cachesize)
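# Usage sketch for the cached TextDoc variant above (hypothetical example;
# assumes cachetools.LRUCache is the LRUCache in scope). `max_cachesize`
# bounds how many memoized results the document keeps at once.
doc = TextDoc('Short English text.', lang='en', max_cachesize=10)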
import csv

import textacy
# assumed import paths: detect_language ships with textacy's text_utils
from textacy.text_utils import detect_language

from src.utils import preprocess

if __name__ == '__main__':
    EMOTION_DATAPATH = 'data/processed/emotions_full.csv'
    FASTTEXT_FULL_FILE = 'data/processed/fasttext_full.txt'
    MODEL_PATH = 'models/emotion_classification/fasttext/model'
    label_prefix = '__label__'
    texts = []
    labels = []
    with open(EMOTION_DATAPATH) as data_file:
        reader = csv.reader(data_file, quoting=csv.QUOTE_MINIMAL)
        next(reader)  # skip the header row
        for i, line in enumerate(reader):
            preprocessed_line = preprocess(line[1])
            if detect_language(preprocessed_line) == 'en':
                doc = textacy.Doc(preprocessed_line, lang='en_core_web_lg')
                texts.append(doc)
                labels.append(line[2])
    # write one "__label__<y> , <text>" line per example, fastText-style
    with open(FASTTEXT_FULL_FILE, 'w') as input_file:
        for x, y in zip(texts, labels):
            input_file.write(' , '.join(
                [label_prefix + str(y), x.text.replace('\n', '')]) + '\n')
    # Hypertuned by fasttext_hypertuning.py
    dim = 300
    lr = 0.1
    epoch = 10
    word_ngrams = 1
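# A plausible continuation of the script above: train and save the supervised
# fastText classifier with the hypertuned parameters. This is a sketch, not
# the repo's actual training call; it assumes the official `fasttext` pip
# package, whose train_supervised() accepts these keyword arguments.
import fasttext

model = fasttext.train_supervised(
    input=FASTTEXT_FULL_FILE,
    dim=dim,
    lr=lr,
    epoch=epoch,
    wordNgrams=word_ngrams,
    label=label_prefix,
)
model.save_model(MODEL_PATH + '.bin')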
def test_detect_language(self):
    for lang, sent in LANG_SENTS:
        self.assertEqual(text_utils.detect_language(sent), lang)
def test_detect_language():
    for lang, sent in LANG_SENTS:
        assert text_utils.detect_language(sent) == lang
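# A hypothetical LANG_SENTS fixture of the shape both tests above assume:
# pairs of an ISO 639-1 language code and a short sentence in that language.
# The sentences here are illustrative, not the project's actual test data.
LANG_SENTS = [
    ('en', 'This is a short sentence written in English.'),
    ('es', 'Esta es una frase corta escrita en español.'),
    ('fr', 'Ceci est une courte phrase écrite en français.'),
]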