def fetch_bernie_and_hillary(data_dir=None, download_if_missing=True, shuffle=False):
    """
    Load the Bernie & Hillary corpus from disk (automatically downloading data
    from S3 if necessary and desired).

    Args:
        data_dir (str): path on disk from which corpus will be loaded;
            if None, textacy's default data_dir is used (optional)
        download_if_missing (bool): if True and corpus not found on disk, it will
            be automatically downloaded from S3 and saved to disk (optional)
        shuffle (bool): if True, randomly shuffle order of documents;
            if False, documents are sorted chronologically (optional)

    Returns:
        list[dict]: each item in list corresponds to a speech document

    Raises:
        IOError: if file is not found on disk and `download_if_missing` is False
        HTTPError: if file is not found on disk, `download_if_missing` is True,
            and something goes wrong with the download

    .. warning:: The Bernie & Hillary corpus has been deprecated! Use the newer
        and more comprehensive CapitolWords corpus instead. To recreate B&H,
        filter CapitolWords speeches by
        `speaker_name={'Bernie Sanders', 'Hillary Clinton'}`.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('always', DeprecationWarning)
        msg = """
            The Bernie & Hillary corpus has been deprecated! Use the newer and more
            comprehensive CapitolWords corpus instead. To recreate B&H, filter
            CapitolWords speeches by `speaker_name={'Bernie Sanders', 'Hillary Clinton'}`.
            """
        warnings.warn(msg.strip(), DeprecationWarning)
    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    fname = os.path.join(data_dir, FNAME)
    try:
        data = list(read_json_lines(fname, mode='rt', encoding=None))
    except (OSError, IOError):
        if download_if_missing is True:
            _download_bernie_and_hillary(data_dir=data_dir)
            data = list(read_json_lines(fname, mode='rt', encoding=None))
        else:
            logger.exception('unable to load corpus from %s', fname)
            raise
    logger.info('loading corpus from %s', fname)
    if shuffle is True:
        random.shuffle(data)
    return data
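# Usage sketch for the loader above -- a hedged example only. It assumes the function
# is importable from `textacy.corpora.bernie_and_hillary`, as in older textacy releases;
# adjust the import to wherever the function is actually defined.
from textacy.corpora import bernie_and_hillary as bnh

speeches = bnh.fetch_bernie_and_hillary(download_if_missing=True, shuffle=False)
print(len(speeches))               # number of speech documents
print(sorted(speeches[0].keys()))  # fields of the first (oldest) speech dict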
def _iterate(self, text_only, subreddit, date_range, score_range, min_len, limit):
    """
    Low-level method to iterate over the records in this dataset. Used by
    :meth:`RedditComments.texts()` and :meth:`RedditComments.records()`.
    """
    if subreddit:
        if isinstance(subreddit, compat.string_types):
            subreddit = {subreddit}
        elif isinstance(subreddit, (list, tuple)):
            subreddit = set(subreddit)
    if score_range:
        score_range = self._parse_score_range(score_range)
    if date_range:
        date_range = self._parse_date_range(date_range)
        needed_filenames = {
            os.path.join(self.data_dir, fstub)
            for fstub in self._generate_filestubs(date_range)}
        filenames = tuple(
            fname for fname in self.filenames
            if fname in needed_filenames)
    else:
        filenames = self.filenames
    if not filenames:
        raise IOError(
            'No files found at {} corresponding to date range {}'.format(
                self.data_dir, date_range))
    n = 0
    for filename in filenames:
        for line in fileio.read_json_lines(filename, mode='rb'):
            if subreddit and line['subreddit'] not in subreddit:
                continue
            if score_range and not score_range[0] <= line['score'] < score_range[1]:
                continue
            line['created_utc'] = self._convert_timestamp(line.get('created_utc', ''))
            if date_range and not date_range[0] <= line['created_utc'] < date_range[1]:
                continue
            line['body'] = self._clean_content(line['body'])
            if min_len and len(line['body']) < min_len:
                continue
            if text_only is True:
                yield line['body']
            else:
                line['retrieved_on'] = self._convert_timestamp(line.get('retrieved_on', ''))
                yield line
            n += 1
            if n == limit:
                break
        if n == limit:
            break
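# Hedged usage sketch for the iterator above: callers normally go through the public
# `texts()` / `records()` wrappers rather than `_iterate()` directly. The class path
# below (textacy's `datasets` subpackage) and an already-downloaded data_dir are
# assumptions, not shown in this listing.
from textacy.datasets import RedditComments

rc = RedditComments()
for record in rc.records(subreddit='askscience', score_range=(10, 1000), limit=5):
    # each record is the filtered, timestamp-converted dict yielded above
    print(record['created_utc'], record['score'], record['body'][:80])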
def test_read_write_json_lines(self):
    expected = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    filename = os.path.join(self.tempdir, 'test_read_write_json_lines.json')
    fileio.write_json_lines(expected, filename)
    observed = list(fileio.read_json_lines(filename))
    self.assertEqual(observed, expected)
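# A minimal round-trip with the fileio helpers exercised by this test, outside the
# unittest harness -- a sketch, assuming `textacy.fileio` exposes `write_json_lines`
# and `read_json_lines` as used above; the file path is illustrative.
from textacy import fileio

rows = [{'idx': 0, 'sent': 'First sentence.'}, {'idx': 1, 'sent': 'Second sentence.'}]
fileio.write_json_lines(rows, '/tmp/example.json')            # one JSON object per line
assert list(fileio.read_json_lines('/tmp/example.json')) == rows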
def _iterate(self, text_only, subreddit, date_range, score_range, min_len, limit): """Note: Use `.texts()` or `.records()` to iterate over corpus data.""" if subreddit: if isinstance(subreddit, string_types): subreddit = {subreddit} elif isinstance(subreddit, (list, tuple)): subreddit = set(subreddit) if date_range: if not isinstance(date_range, (list, tuple)): msg = "`date_range` must be a list or tuple, not {}".format(type(date_range)) raise ValueError(msg) if not len(date_range) == 2: msg = "`date_range` must have both start and end values" raise ValueError(msg) if not date_range[0]: date_range = (MIN_DATE, date_range[1]) if not date_range[1]: date_range = (date_range[0], MAX_DATE) if score_range: if not isinstance(score_range, (list, tuple)): msg = "`score_range` must be a list or tuple, not {}".format(type(score_range)) raise ValueError(msg) if len(score_range) != 2: msg = "`score_range` must have both min and max values" raise ValueError(msg) if not score_range[0]: score_range = (MIN_INT, score_range[1]) if not score_range[1]: score_range = (score_range[0], MAX_INT) n = 0 mode = "rb" if PY2 else "rt" # Python 2 can't open json in text mode for path in self.paths: for line in read_json_lines(path, mode=mode): if subreddit and line["subreddit"] not in subreddit: continue if score_range and not score_range[0] <= line["score"] <= score_range[1]: continue line["created_utc"] = self._convert_timestamp(line.get("created_utc", "")) if date_range and not date_range[0] <= line["created_utc"] <= date_range[1]: continue line["body"] = self._clean_content(line["body"]) if min_len and len(line["body"]) < min_len: continue if text_only is True: yield line["body"] else: line["retrieved_on"] = self._convert_timestamp(line.get("retrieved_on", "")) yield line n += 1 if n == limit: break if n == limit: break
def load(cls, path, fname_prefix=None, compression=None):
    """
    Load serialized content and metadata from disk, and initialize a TextCorpus.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            prepended to standard filenames 'spacy_docs.bin' and 'metadatas.json'
            when saving to disk
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file

    Returns:
        :class:`textacy.TextCorpus`

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not the
        same as the one used to load it, there will be problems! Consequently,
        this functionality is only useful as short-term but not long-term storage.
    """
    if fname_prefix:
        info_fname = os.path.join(path, '_'.join([fname_prefix, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([fname_prefix, 'metadatas.json']))
        docs_fname = os.path.join(path, '_'.join([fname_prefix, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                               else '.bz2' if compression == 'bz2'
                               else '.xz' if compression == 'lzma'
                               else '')
    meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this TextCorpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded TextCorpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textcorpus.add_doc(
            TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                    lang=lang, metadata=metadata))
    return textcorpus
def _iterate(self, text_only, opinion_author=None, decision_direction=None,
             issue_area=None, date_range=None, min_len=None, limit=-1):
    """Note: Use `.texts()` or `.records()` to iterate over corpus data."""
    # prepare filters
    if opinion_author:
        if isinstance(opinion_author, int):
            opinion_author = {opinion_author}
        if not all(oa in self.opinion_author_codes for oa in opinion_author):
            msg = 'invalid `opinion_author` value; see `SupremeCourt.opinion_author_codes`'
            raise ValueError(msg)
    if issue_area:
        if isinstance(issue_area, int):
            issue_area = {issue_area}
        if not all(ii in self.issue_area_codes for ii in issue_area):
            msg = 'invalid `issue_area` value; see `SupremeCourt.issue_area_codes`'
            raise ValueError(msg)
    if decision_direction:
        if isinstance(decision_direction, string_types):
            decision_direction = {decision_direction}
        if not all(dd in self.decision_directions for dd in decision_direction):
            msg = 'invalid `decision_direction` value; see `SupremeCourt.decision_directions`'
            raise ValueError(msg)
    if date_range:
        if not isinstance(date_range, (list, tuple)):
            msg = '`date_range` must be a list or tuple, not {}'.format(type(date_range))
            raise ValueError(msg)
        if not len(date_range) == 2:
            msg = '`date_range` must have both start and end values'
            raise ValueError(msg)
        if not date_range[0]:
            date_range = (MIN_DATE, date_range[1])
        if not date_range[1]:
            date_range = (date_range[0], MAX_DATE)
    n = 0
    mode = 'rb' if PY2 else 'rt'
    for line in read_json_lines(self.filepath, mode=mode):
        if opinion_author and line['maj_opinion_author'] not in opinion_author:
            continue
        if issue_area and line['issue_area'] not in issue_area:
            continue
        if decision_direction and line['decision_direction'] not in decision_direction:
            continue
        if date_range and not date_range[0] <= line['decision_date'] <= date_range[1]:
            continue
        if min_len and len(line['text']) < min_len:
            continue
        if text_only is True:
            yield line['text']
        else:
            yield line
        n += 1
        if n == limit:
            break
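# Hedged usage sketch for the SupremeCourt iterator above, via the public `texts()`
# wrapper mentioned in its docstring. The class path, an already-downloaded data file,
# and the particular issue_area code are assumptions for illustration only.
from textacy.datasets import SupremeCourt

sc = SupremeCourt()
for text in sc.texts(issue_area={2}, date_range=('1990-01-01', '1999-12-31'), limit=3):
    # text_only=True path: only the opinion text is yielded
    print(text[:100])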
def _iterate(self, text_only, opinion_author=None, decision_direction=None,
             issue_area=None, date_range=None, min_len=None, limit=-1):
    """Note: Use `.texts()` or `.records()` to iterate over corpus data."""
    # prepare filters
    if opinion_author:
        if isinstance(opinion_author, int):
            opinion_author = {opinion_author}
        if not all(oa in self.opinion_author_codes for oa in opinion_author):
            msg = 'invalid `opinion_author` value; see `SupremeCourt.opinion_author_codes`'
            raise ValueError(msg)
    if issue_area:
        if isinstance(issue_area, int):
            issue_area = {issue_area}
        if not all(ii in self.issue_area_codes for ii in issue_area):
            msg = 'invalid `issue_area` value; see `SupremeCourt.issue_area_codes`'
            raise ValueError(msg)
    if decision_direction:
        if isinstance(decision_direction, string_types):
            decision_direction = {decision_direction}
        if not all(dd in self.decision_directions for dd in decision_direction):
            msg = 'invalid `decision_direction` value; see `SupremeCourt.decision_directions`'
            raise ValueError(msg)
    if date_range:
        if not isinstance(date_range, (list, tuple)):
            msg = '`date_range` must be a list or tuple, not {}'.format(type(date_range))
            raise ValueError(msg)
        if not len(date_range) == 2:
            msg = '`date_range` must have both start and end values'
            raise ValueError(msg)
        if not date_range[0]:
            date_range = (MIN_DATE, date_range[1])
        if not date_range[1]:
            date_range = (date_range[0], MAX_DATE)
    n = 0
    mode = 'rb' if is_python2 else 'rt'
    for line in read_json_lines(self.filepath, mode=mode):
        if opinion_author and line['maj_opinion_author'] not in opinion_author:
            continue
        if issue_area and line['issue_area'] not in issue_area:
            continue
        if decision_direction and line['decision_direction'] not in decision_direction:
            continue
        if date_range and not date_range[0] <= line['decision_date'] <= date_range[1]:
            continue
        if min_len and len(line['text']) < min_len:
            continue
        if text_only is True:
            yield line['text']
        else:
            yield line
        n += 1
        if n == limit:
            break
def test_read_write_json_lines(self):
    expected = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    filename = os.path.join(self.tempdir, 'test_read_write_json_lines.json')
    fileio.write_json_lines(expected, filename)
    observed = list(fileio.read_json_lines(filename))
    self.assertEqual(observed, expected)
def fetch_bernie_and_hillary(data_dir=None, download_if_missing=True, shuffle=False):
    """
    Load the Bernie & Hillary corpus from disk (automatically downloading data
    from S3 if necessary and desired).

    Args:
        data_dir (str): path on disk from which corpus will be loaded;
            if None, textacy's default data_dir is used (optional)
        download_if_missing (bool): if True and corpus not found on disk, it will
            be automatically downloaded from S3 and saved to disk (optional)
        shuffle (bool): if True, randomly shuffle order of documents;
            if False, documents are sorted chronologically (optional)

    Returns:
        list[dict]: each item in list corresponds to a speech document

    Raises:
        IOError: if file is not found on disk and `download_if_missing` is False
        HTTPError: if file is not found on disk, `download_if_missing` is True,
            and something goes wrong with the download
    """
    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    fname = os.path.join(data_dir, FNAME)
    try:
        data = list(read_json_lines(fname, mode='rt', encoding=None))
    except (OSError, IOError):
        if download_if_missing is True:
            _download_bernie_and_hillary(data_dir=data_dir)
            data = list(read_json_lines(fname, mode='rt', encoding=None))
        else:
            logger.exception('unable to load corpus from %s', fname)
            raise
    logger.info('loading corpus from %s', fname)
    if shuffle is True:
        random.shuffle(data)
    return data
def load(cls, path, name=None, compression=None):
    """
    Load content and metadata from disk, and initialize a ``Corpus``.

    Args:
        path (str): Directory on disk where content + metadata are saved.
        name (str): Identifying/uniquifying name prepended to the default
            filenames 'spacy_docs.bin', 'metadatas.json', and 'info.json',
            used when corpus was saved to disk via :meth:`Corpus.save()`.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file when saved, if any.

    Returns:
        :class:`textacy.Corpus <Corpus>`

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but not
        long-term storage.
    """
    if name:
        info_fname = os.path.join(path, '_'.join([name, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([name, 'metadatas.json']))
        docs_fname = os.path.join(path, '_'.join([name, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                               else '.bz2' if compression == 'bz2'
                               else '.xz' if compression == 'lzma'
                               else '')
    meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this Corpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded Corpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    corpus = Corpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
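# Hedged round-trip sketch for `Corpus.load()` above. It assumes `Corpus.save()` mirrors
# the load signature (path, name, compression), that `Corpus('en', texts=...)` is a valid
# constructor, and that the same `spacy.Vocab` is in use when reloading (see the warning
# in the docstring); path and name below are illustrative.
import textacy

corpus = textacy.Corpus('en', texts=['First sentence here.', 'Second sentence here.'])
corpus.save('/tmp/corpus_dir', name='demo', compression='gzip')
same_corpus = textacy.Corpus.load('/tmp/corpus_dir', name='demo', compression='gzip')
print(len(same_corpus))  # should match the number of docs that were saved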
def load(cls, path, fname_prefix=None, compression=None):
    """
    Load serialized content and metadata from disk, and initialize a TextCorpus.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            prepended to standard filenames 'spacy_docs.bin' and 'metadatas.json'
            when saving to disk
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file

    Returns:
        :class:`textacy.TextCorpus`

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not the
        same as the one used to load it, there will be problems! Consequently,
        this functionality is only useful as short-term but not long-term storage.
    """
    if fname_prefix:
        info_fname = os.path.join(path, '_'.join([fname_prefix, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([fname_prefix, 'metadatas.json']))
        docs_fname = os.path.join(path, '_'.join([fname_prefix, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                               else '.bz2' if compression == 'bz2'
                               else '.xz' if compression == 'lzma'
                               else '')
    meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this TextCorpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded TextCorpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textcorpus.add_doc(
            TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                    lang=lang, metadata=metadata))
    return textcorpus
def test_read_write_json_lines_unicode(self):
    expected = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_json_lines_unicode' + ext)
        if PY2 is True and ext != '.json':
            self.assertRaises(
                ValueError, fileio.open_sesame, filename, 'wt', None, True)
        else:
            fileio.write_json_lines(expected, filename, mode='wt', auto_make_dirs=True)
            observed = list(fileio.read_json_lines(filename, mode='rt'))
            self.assertEqual(observed, expected)
def load(cls, path, fname_prefix=None):
    """
    Load serialized content and metadata from disk, and initialize a TextCorpus.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            prepended to standard filenames 'spacy_docs.bin' and 'metadatas.json'
            when saving to disk

    Returns:
        :class:`textacy.TextCorpus`
    """
    if fname_prefix:
        info_fname = os.path.join(path, '_'.join([fname_prefix, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([fname_prefix, 'metadatas.json']))
        docs_fname = os.path.join(path, '_'.join([fname_prefix, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this TextCorpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded TextCorpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textcorpus.add_doc(
            TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                    lang=lang, metadata=metadata))
    return textcorpus
def test_read_write_json_lines_bytes(self):
    expected = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(self.tempdir, 'test_read_write_json_lines_bytes' + ext)
        if is_python2 is True:
            if ext == '.json.xz':
                self.assertRaises(ValueError, fileio.open_sesame,
                                  filename, 'wb', 'utf-8', True)
            else:
                fileio.write_json_lines(expected, filename, mode='wb', auto_make_dirs=True)
                observed = list(fileio.read_json_lines(filename, mode='rb'))
                self.assertEqual(observed, expected)
        else:
            self.assertRaises(TypeError, fileio.write_json_lines,
                              expected, filename, 'wb', None, True)
def _iterate(self, text_only, opinion_author=None, decision_direction=None,
             issue_area=None, date_range=None, min_len=None, limit=-1):
    """
    Low-level method to iterate over the records in this dataset. Used by
    :meth:`SupremeCourt.texts()` and :meth:`SupremeCourt.records()`.
    """
    if not self.filename:
        raise IOError('{} file not found'.format(self._filename))
    if opinion_author:
        if isinstance(opinion_author, int):
            opinion_author = {opinion_author}
        if not all(oa in self.opinion_author_codes for oa in opinion_author):
            msg = 'invalid `opinion_author` value; see `SupremeCourt.opinion_author_codes`'
            raise ValueError(msg)
    if issue_area:
        if isinstance(issue_area, int):
            issue_area = {issue_area}
        if not all(ii in self.issue_area_codes for ii in issue_area):
            msg = 'invalid `issue_area` value; see `SupremeCourt.issue_area_codes`'
            raise ValueError(msg)
    if decision_direction:
        if isinstance(decision_direction, compat.string_types):
            decision_direction = {decision_direction}
        if not all(dd in self.decision_directions for dd in decision_direction):
            msg = 'invalid `decision_direction` value; see `SupremeCourt.decision_directions`'
            raise ValueError(msg)
    if date_range:
        date_range = self._parse_date_range(date_range)
    n = 0
    mode = 'rb' if compat.is_python2 else 'rt'
    for line in fileio.read_json_lines(self.filename, mode=mode):
        if opinion_author and line['maj_opinion_author'] not in opinion_author:
            continue
        if issue_area and line['issue_area'] not in issue_area:
            continue
        if decision_direction and line['decision_direction'] not in decision_direction:
            continue
        if date_range and not date_range[0] <= line['decision_date'] <= date_range[1]:
            continue
        if min_len and len(line['text']) < min_len:
            continue
        if text_only is True:
            yield line['text']
        else:
            yield line
        n += 1
        if n == limit:
            break
def _iterate(self, text_only, speaker_name=None, speaker_party=None, chamber=None,
             congress=None, date_range=None, min_len=None, limit=-1):
    """Note: Use `.texts()` or `.records()` to iterate over corpus data."""
    # prepare filters
    if speaker_name:
        if isinstance(speaker_name, string_types):
            speaker_name = {speaker_name}
        if not all(item in self.speaker_names for item in speaker_name):
            raise ValueError(
                'all values in `speaker_name` must be valid; see `CapitolWords.speaker_names`')
    if speaker_party:
        if isinstance(speaker_party, string_types):
            speaker_party = {speaker_party}
        if not all(item in self.speaker_parties for item in speaker_party):
            raise ValueError(
                'all values in `speaker_party` must be valid; see `CapitolWords.speaker_parties`')
    if chamber:
        if isinstance(chamber, string_types):
            chamber = {chamber}
        if not all(item in self.chambers for item in chamber):
            raise ValueError(
                'all values in `chamber` must be valid; see `CapitolWords.chambers`')
    if congress:
        if isinstance(congress, int):
            congress = {congress}
        if not all(item in self.congresses for item in congress):
            raise ValueError(
                'all values in `congress` must be valid; see `CapitolWords.congresses`')
    if date_range:
        if not isinstance(date_range, (list, tuple)):
            raise ValueError(
                '`date_range` must be a list or tuple, not {}'.format(type(date_range)))
        if not len(date_range) == 2:
            raise ValueError('`date_range` must have both start and end values')
        if not date_range[0]:
            date_range = (MIN_DATE, date_range[1])
        if not date_range[1]:
            date_range = (date_range[0], MAX_DATE)
    n = 0
    mode = 'rb' if is_python2 else 'rt'
    for line in read_json_lines(self.filepath, mode=mode):
        if speaker_name and line['speaker_name'] not in speaker_name:
            continue
        if speaker_party and line['speaker_party'] not in speaker_party:
            continue
        if chamber and line['chamber'] not in chamber:
            continue
        if congress and line['congress'] not in congress:
            continue
        if date_range and not date_range[0] <= line['date'] <= date_range[1]:
            continue
        if min_len and len(line['text']) < min_len:
            continue
        if text_only is True:
            yield line['text']
        else:
            yield line
        n += 1
        if n == limit:
            break
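# Hedged usage sketch for the CapitolWords iterator above, through the public `records()`
# wrapper that forwards these filters. The class path and pre-downloaded data are
# assumptions; the speaker_name values mirror the B&H note earlier in this listing.
from textacy.datasets import CapitolWords

cw = CapitolWords()
for speech in cw.records(speaker_name={'Bernie Sanders', 'Hillary Clinton'}, limit=3):
    # each speech is the raw record dict yielded above
    print(speech['date'], speech['speaker_name'], len(speech['text']))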
def _iterate(self, text_only, subreddit, date_range, score_range, min_len, limit):
    """Note: Use `.texts()` or `.records()` to iterate over corpus data."""
    if subreddit:
        if isinstance(subreddit, string_types):
            subreddit = {subreddit}
        elif isinstance(subreddit, (list, tuple)):
            subreddit = set(subreddit)
    if date_range:
        if not isinstance(date_range, (list, tuple)):
            msg = '`date_range` must be a list or tuple, not {}'.format(type(date_range))
            raise ValueError(msg)
        if not len(date_range) == 2:
            msg = '`date_range` must have both start and end values'
            raise ValueError(msg)
        if not date_range[0]:
            date_range = (MIN_DATE, date_range[1])
        if not date_range[1]:
            date_range = (date_range[0], MAX_DATE)
    if score_range:
        if not isinstance(score_range, (list, tuple)):
            msg = '`score_range` must be a list or tuple, not {}'.format(type(score_range))
            raise ValueError(msg)
        if len(score_range) != 2:
            msg = '`score_range` must have both min and max values'
            raise ValueError(msg)
        if not score_range[0]:
            score_range = (MIN_INT, score_range[1])
        if not score_range[1]:
            score_range = (score_range[0], MAX_INT)
    n = 0
    mode = 'rb' if PY2 else 'rt'  # Python 2 can't open json in text mode
    for path in self.paths:
        for line in read_json_lines(path, mode=mode):
            if subreddit and line['subreddit'] not in subreddit:
                continue
            if score_range and not score_range[0] <= line['score'] <= score_range[1]:
                continue
            line['created_utc'] = self._convert_timestamp(line.get('created_utc', ''))
            if date_range and not date_range[0] <= line['created_utc'] <= date_range[1]:
                continue
            line['body'] = self._clean_content(line['body'])
            if min_len and len(line['body']) < min_len:
                continue
            if text_only is True:
                yield line['body']
            else:
                line['retrieved_on'] = self._convert_timestamp(line.get('retrieved_on', ''))
                yield line
            n += 1
            if n == limit:
                break
        if n == limit:
            break
def _iterate(self, text_only, speaker_name, speaker_party, chamber, congress,
             date_range, min_len, limit):
    """
    Low-level method to iterate over the records in this dataset. Used by
    :meth:`CapitolWords.texts()` and :meth:`CapitolWords.records()`.
    """
    if not self.filename:
        raise IOError('{} file not found'.format(self._filename))
    if speaker_name:
        if isinstance(speaker_name, compat.string_types):
            speaker_name = {speaker_name}
        if not all(item in self.speaker_names for item in speaker_name):
            raise ValueError('all values in `speaker_name` must be valid; '
                             'see :attr:`CapitolWords.speaker_names`')
    if speaker_party:
        if isinstance(speaker_party, compat.string_types):
            speaker_party = {speaker_party}
        if not all(item in self.speaker_parties for item in speaker_party):
            raise ValueError('all values in `speaker_party` must be valid; '
                             'see :attr:`CapitolWords.speaker_parties`')
    if chamber:
        if isinstance(chamber, compat.string_types):
            chamber = {chamber}
        if not all(item in self.chambers for item in chamber):
            raise ValueError('all values in `chamber` must be valid; '
                             'see :attr:`CapitolWords.chambers`')
    if congress:
        if isinstance(congress, int):
            congress = {congress}
        if not all(item in self.congresses for item in congress):
            raise ValueError('all values in `congress` must be valid; '
                             'see :attr:`CapitolWords.congresses`')
    if date_range:
        date_range = self._parse_date_range(date_range)
    n = 0
    mode = 'rb' if compat.is_python2 else 'rt'  # TODO: check this
    for line in fileio.read_json_lines(self.filename, mode=mode):
        if speaker_name and line['speaker_name'] not in speaker_name:
            continue
        if speaker_party and line['speaker_party'] not in speaker_party:
            continue
        if chamber and line['chamber'] not in chamber:
            continue
        if congress and line['congress'] not in congress:
            continue
        if date_range and not date_range[0] <= line['date'] <= date_range[1]:
            continue
        if min_len and len(line['text']) < min_len:
            continue
        if text_only is True:
            yield line['text']
        else:
            yield line
        n += 1
        if n == limit:
            break
def _iterate(
    self,
    text_only,
    speaker_name=None,
    speaker_party=None,
    chamber=None,
    congress=None,
    date_range=None,
    min_len=None,
    limit=-1,
):
    """Note: Use `.texts()` or `.records()` to iterate over corpus data."""
    # prepare filters
    if speaker_name:
        if isinstance(speaker_name, string_types):
            speaker_name = {speaker_name}
        if not all(item in self.speaker_names for item in speaker_name):
            raise ValueError("all values in `speaker_name` must be valid; see `CapitolWords.speaker_names`")
    if speaker_party:
        if isinstance(speaker_party, string_types):
            speaker_party = {speaker_party}
        if not all(item in self.speaker_parties for item in speaker_party):
            raise ValueError("all values in `speaker_party` must be valid; see `CapitolWords.speaker_parties`")
    if chamber:
        if isinstance(chamber, string_types):
            chamber = {chamber}
        if not all(item in self.chambers for item in chamber):
            raise ValueError("all values in `chamber` must be valid; see `CapitolWords.chambers`")
    if congress:
        if isinstance(congress, int):
            congress = {congress}
        if not all(item in self.congresses for item in congress):
            raise ValueError("all values in `congress` must be valid; see `CapitolWords.congresses`")
    if date_range:
        if not isinstance(date_range, (list, tuple)):
            raise ValueError("`date_range` must be a list or tuple, not {}".format(type(date_range)))
        if not len(date_range) == 2:
            raise ValueError("`date_range` must have both start and end values")
        if not date_range[0]:
            date_range = (MIN_DATE, date_range[1])
        if not date_range[1]:
            date_range = (date_range[0], MAX_DATE)
    n = 0
    mode = "rb" if PY2 else "rt"
    for line in read_json_lines(self.filepath, mode=mode):
        if speaker_name and line["speaker_name"] not in speaker_name:
            continue
        if speaker_party and line["speaker_party"] not in speaker_party:
            continue
        if chamber and line["chamber"] not in chamber:
            continue
        if congress and line["congress"] not in congress:
            continue
        if date_range and not date_range[0] <= line["date"] <= date_range[1]:
            continue
        if min_len and len(line["text"]) < min_len:
            continue
        if text_only is True:
            yield line["text"]
        else:
            yield line
        n += 1
        if n == limit:
            break