示例#1
0
def fetch_bernie_and_hillary(data_dir=None,
                             download_if_missing=True,
                             shuffle=False):
    """
    Read the Bernie & Hillary corpus into memory, downloading it first when
    it can't be read from disk and downloading is allowed.

    A ``DeprecationWarning`` is always emitted when this function is called.

    Args:
        data_dir (str): directory on disk expected to hold the corpus file;
            when None, ``DEFAULT_DATA_DIR`` is used (optional)
        download_if_missing (bool): when True and the corpus file can't be
            read from disk, it is downloaded into ``data_dir`` and read
            again (optional)
        shuffle (bool): when True, the loaded documents are put in random
            order; when False, they keep the order stored in the file,
            documented as chronological (optional)

    Returns:
        list[dict]: one item per speech document read from the corpus file

    Raises:
        IOError: if the file can't be read from disk and
            `download_if_missing` is not True
        HTTPError: if the file can't be read from disk, `download_if_missing`
            is True, and something goes wrong with the download

    .. warning:: The Bernie & Hillary corpus has been deprecated! Use the
        newer and more comprehensive CapitolWords corpus instead. To recreate
        B&H, filter CapitolWords speeches by
        `speaker_name={'Bernie Sanders', 'Hillary Clinton'}`.
    """
    deprecation_msg = """
            The Bernie & Hillary corpus has been deprecated! Use the newer and
            more comprehensive CapitolWords corpus instead. To recreate B&H,
            filter CapitolWords speeches by `speaker_name={'Bernie Sanders', 'Hillary Clinton'}`.
            """.strip()
    with warnings.catch_warnings():
        # make sure the deprecation notice is shown regardless of any
        # filter currently configured for DeprecationWarning
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(deprecation_msg, DeprecationWarning)

    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    corpus_path = os.path.join(data_dir, FNAME)

    def _read_corpus():
        return list(read_json_lines(corpus_path, mode='rt', encoding=None))

    try:
        speeches = _read_corpus()
    except (OSError, IOError):
        if download_if_missing is not True:
            logger.exception('unable to load corpus from %s', corpus_path)
            raise
        # file wasn't readable: fetch it, then try exactly once more
        _download_bernie_and_hillary(data_dir=data_dir)
        speeches = _read_corpus()
    logger.info('loading corpus from %s', corpus_path)

    if shuffle is True:
        random.shuffle(speeches)
    return speeches
示例#2
0
def fetch_bernie_and_hillary(data_dir=None,
                             download_if_missing=True,
                             shuffle=False):
    """
    Read the Bernie & Hillary corpus into memory, downloading it first when
    it can't be read from disk and downloading is allowed.

    A ``DeprecationWarning`` is always emitted when this function is called.

    Args:
        data_dir (str): directory on disk expected to hold the corpus file;
            when None, ``DEFAULT_DATA_DIR`` is used (optional)
        download_if_missing (bool): when True and the corpus file can't be
            read from disk, it is downloaded into ``data_dir`` and read
            again (optional)
        shuffle (bool): when True, the loaded documents are put in random
            order; when False, they keep the order stored in the file,
            documented as chronological (optional)

    Returns:
        list[dict]: one item per speech document read from the corpus file

    Raises:
        IOError: if the file can't be read from disk and
            `download_if_missing` is not True
        HTTPError: if the file can't be read from disk, `download_if_missing`
            is True, and something goes wrong with the download

    .. warning:: The Bernie & Hillary corpus has been deprecated! Use the
        newer and more comprehensive CapitolWords corpus instead. To recreate
        B&H, filter CapitolWords speeches by
        `speaker_name={'Bernie Sanders', 'Hillary Clinton'}`.
    """
    deprecation_msg = """
            The Bernie & Hillary corpus has been deprecated! Use the newer and
            more comprehensive CapitolWords corpus instead. To recreate B&H,
            filter CapitolWords speeches by `speaker_name={'Bernie Sanders', 'Hillary Clinton'}`.
            """.strip()
    with warnings.catch_warnings():
        # make sure the deprecation notice is shown regardless of any
        # filter currently configured for DeprecationWarning
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(deprecation_msg, DeprecationWarning)

    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    corpus_path = os.path.join(data_dir, FNAME)

    def _read_corpus():
        return list(read_json_lines(corpus_path, mode='rt', encoding=None))

    try:
        speeches = _read_corpus()
    except (OSError, IOError):
        if download_if_missing is not True:
            logger.exception('unable to load corpus from %s', corpus_path)
            raise
        # file wasn't readable: fetch it, then try exactly once more
        _download_bernie_and_hillary(data_dir=data_dir)
        speeches = _read_corpus()
    logger.info('loading corpus from %s', corpus_path)

    if shuffle is True:
        random.shuffle(speeches)
    return speeches
示例#3
0
    def _iterate(self, text_only, subreddit, date_range, score_range, min_len,
                 limit):
        """
        Generator that walks this dataset's files record by record, applying
        the given filters. It backs :meth:`RedditComments.texts()` and
        :meth:`RedditComments.records()`.

        Yields the cleaned ``body`` string when ``text_only`` is True,
        otherwise the whole record. Raises :class:`IOError` if there are no
        files to read.
        """
        # normalize the filters; order matters since the parsers may raise
        if subreddit:
            if isinstance(subreddit, compat.string_types):
                subreddit = {subreddit}
            elif isinstance(subreddit, (list, tuple)):
                subreddit = set(subreddit)
        if score_range:
            score_range = self._parse_score_range(score_range)

        if not date_range:
            filenames = self.filenames
        else:
            date_range = self._parse_date_range(date_range)
            # only read the files whose stubs fall inside the date range
            wanted = set()
            for fstub in self._generate_filestubs(date_range):
                wanted.add(os.path.join(self.data_dir, fstub))
            filenames = tuple(
                candidate for candidate in self.filenames
                if candidate in wanted)

        if not filenames:
            raise IOError(
                'No files found at {} corresponding to date range {}'.format(
                    self.data_dir, date_range))

        n_yielded = 0
        for path in filenames:
            for record in fileio.read_json_lines(path, mode='rb'):

                if subreddit and record['subreddit'] not in subreddit:
                    continue
                # score window is half-open: [min, max)
                if score_range and not (
                        score_range[0] <= record['score'] < score_range[1]):
                    continue
                record['created_utc'] = self._convert_timestamp(
                    record.get('created_utc', ''))
                # date window is half-open as well: [start, end)
                if date_range and not (
                        date_range[0] <= record['created_utc'] < date_range[1]):
                    continue
                record['body'] = self._clean_content(record['body'])
                if min_len and len(record['body']) < min_len:
                    continue

                if text_only is not True:
                    record['retrieved_on'] = self._convert_timestamp(
                        record.get('retrieved_on', ''))
                    yield record
                else:
                    yield record['body']

                n_yielded += 1
                if n_yielded == limit:
                    return

            # also checked between files, exactly as the per-record check
            if n_yielded == limit:
                return
示例#4
0
 def test_read_write_json_lines(self):
     # build one {'idx', 'sent'} record per sentence of the fixture document
     records = []
     for idx, sent in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sent.text})
     path = os.path.join(self.tempdir, 'test_read_write_json_lines.json')
     # write the records out, read them back, and expect an exact round trip
     fileio.write_json_lines(records, path)
     roundtripped = list(fileio.read_json_lines(path))
     self.assertEqual(roundtripped, records)
示例#5
0
    def _iterate(self, text_only, subreddit, date_range, score_range, min_len, limit):
        """
        Low-level generator over the records stored in ``self.paths``, with
        optional filtering.

        Note: Use `.texts()` or `.records()` to iterate over corpus data.

        Args:
            text_only (bool): If True, yield only each record's cleaned
                "body" text; otherwise, yield the full record.
            subreddit (str or list/tuple/set of str): If given, only records
                whose "subreddit" value is among these are yielded.
            date_range (list or tuple): If given, a (start, end) pair; only
                records whose converted "created_utc" lies within it (both
                ends inclusive) are yielded. A falsy bound is replaced by
                ``MIN_DATE`` / ``MAX_DATE``, respectively.
            score_range (list or tuple): If given, a (min, max) pair; only
                records whose "score" lies within it (both ends inclusive)
                are yielded. A bound of None is replaced by ``MIN_INT`` /
                ``MAX_INT``, respectively; 0 is a valid explicit bound.
            min_len (int): If given, records whose cleaned "body" is shorter
                than this are skipped.
            limit (int): Iteration stops once the number of yielded items
                equals this value.

        Yields:
            str or dict: Cleaned body text if ``text_only`` is True, else
            the record with "created_utc", "body", and "retrieved_on"
            replaced by their converted / cleaned values.

        Raises:
            ValueError: If ``date_range`` or ``score_range`` is given but is
                not a list or tuple of exactly two items.
        """
        if subreddit:
            if isinstance(subreddit, string_types):
                subreddit = {subreddit}
            elif isinstance(subreddit, (list, tuple)):
                subreddit = set(subreddit)
        if date_range:
            if not isinstance(date_range, (list, tuple)):
                msg = "`date_range` must be a list or tuple, not {}".format(type(date_range))
                raise ValueError(msg)
            if not len(date_range) == 2:
                msg = "`date_range` must have both start and end values"
                raise ValueError(msg)
            if not date_range[0]:
                date_range = (MIN_DATE, date_range[1])
            if not date_range[1]:
                date_range = (date_range[0], MAX_DATE)
        if score_range:
            if not isinstance(score_range, (list, tuple)):
                msg = "`score_range` must be a list or tuple, not {}".format(type(score_range))
                raise ValueError(msg)
            if len(score_range) != 2:
                msg = "`score_range` must have both min and max values"
                raise ValueError(msg)
            min_score, max_score = score_range
            # A bound of 0 is a legitimate score, so a plain truthiness test
            # would wrongly turn e.g. (0, 10) into (MIN_INT, 10). Only
            # non-zero falsy values (e.g. None) mean "no bound".
            if not min_score and min_score != 0:
                min_score = MIN_INT
            if not max_score and max_score != 0:
                max_score = MAX_INT
            score_range = (min_score, max_score)

        n = 0
        mode = "rb" if PY2 else "rt"  # Python 2 can't open json in text mode
        for path in self.paths:
            for line in read_json_lines(path, mode=mode):

                if subreddit and line["subreddit"] not in subreddit:
                    continue
                if score_range and not score_range[0] <= line["score"] <= score_range[1]:
                    continue
                # timestamp must be converted before it can be compared to the date bounds
                line["created_utc"] = self._convert_timestamp(line.get("created_utc", ""))
                if date_range and not date_range[0] <= line["created_utc"] <= date_range[1]:
                    continue
                # length filter applies to the cleaned text, not the raw body
                line["body"] = self._clean_content(line["body"])
                if min_len and len(line["body"]) < min_len:
                    continue

                if text_only is True:
                    yield line["body"]
                else:
                    line["retrieved_on"] = self._convert_timestamp(line.get("retrieved_on", ""))
                    yield line

                n += 1
                if n == limit:
                    break

            # propagate the inner break so no further files are opened
            if n == limit:
                break
示例#6
0
    def load(cls, path, fname_prefix=None, compression=None):
        """
        Rebuild a TextCorpus from the content and metadata files saved in ``path``.

        Args:
            path (str): directory on disk holding the saved content + metadata
            fname_prefix (str, optional): identifying prefix, joined by '_'
                onto the standard filenames 'info.json', 'metadatas.json',
                and 'spacy_docs.bin'
            compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
                applied to the metadatas json file; selects the extension
                ('.gz', '.bz2', or '.xz') appended to its filename

        Returns:
            :class:`textacy.TextCorpus`

        .. warning:: If the `spacy.Vocab` object used to save this corpus is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        def _saved_file(basename):
            # the optional prefix is glued onto the name with an underscore
            if fname_prefix:
                basename = '_'.join([fname_prefix, basename])
            return os.path.join(path, basename)

        info_fname = _saved_file('info.json')
        meta_fname = _saved_file('metadatas.json')
        docs_fname = _saved_file('spacy_docs.bin')

        # only the metadata file carries a compression-specific extension
        if compression == 'gzip':
            meta_fname += '.gz'
        elif compression == 'bz2':
            meta_fname += '.bz2'
        elif compression == 'lzma':
            meta_fname += '.xz'

        if PY2 is False or compression is None:
            meta_mode = 'rt'
        else:
            meta_mode = 'rb'

        info_records = list(fileio.read_json(info_fname))
        package_info = info_records[0]
        lang = package_info['textacy_lang']
        saved_version = package_info['spacy_version']
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        textcorpus = TextCorpus(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # docs and their metadata are paired up positionally
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            doc = TextDoc(
                spacy_doc,
                spacy_pipeline=textcorpus.spacy_pipeline,
                lang=lang,
                metadata=metadata)
            textcorpus.add_doc(doc)
        return textcorpus
示例#7
0
    def _iterate(self, text_only, opinion_author=None, decision_direction=None,
                 issue_area=None, date_range=None, min_len=None, limit=-1):
        """Note: Use `.texts()` or `.records()` to iterate over corpus data.

        Generator over the records read from ``self.filepath``. Each filter
        that is given is validated, then applied; records passing all of
        them are yielded as their "text" value when ``text_only`` is True,
        else as whole records. Iteration stops once the number of yielded
        items equals ``limit``. Invalid filter values raise ``ValueError``.
        """
        # validate / normalize the filters
        if opinion_author:
            if isinstance(opinion_author, int):
                opinion_author = {opinion_author}
            if any(code not in self.opinion_author_codes
                   for code in opinion_author):
                raise ValueError(
                    'invalid `opinion_author` value; see `SupremeCourt.opinion_author_codes`')
        if issue_area:
            if isinstance(issue_area, int):
                issue_area = {issue_area}
            if any(code not in self.issue_area_codes for code in issue_area):
                raise ValueError(
                    'invalid `issue_area` value; see `SupremeCourt.issue_area_codes`')
        if decision_direction:
            if isinstance(decision_direction, string_types):
                decision_direction = {decision_direction}
            if any(direction not in self.decision_directions
                   for direction in decision_direction):
                raise ValueError(
                    'invalid `decision_direction` value; see `SupremeCourt.decision_directions`')
        if date_range:
            if not isinstance(date_range, (list, tuple)):
                raise ValueError(
                    '`date_range` must be a list or tuple, not {}'.format(type(date_range)))
            if len(date_range) != 2:
                raise ValueError(
                    '`date_range` must have both start and end values')
            # a falsy bound means "open-ended" on that side
            start, end = date_range
            date_range = (start or MIN_DATE, end or MAX_DATE)

        if PY2:
            mode = 'rb'
        else:
            mode = 'rt'

        n_yielded = 0
        for case in read_json_lines(self.filepath, mode=mode):
            # a filter that wasn't given always passes; date bounds are inclusive
            keep = (
                (not opinion_author
                 or case['maj_opinion_author'] in opinion_author)
                and (not issue_area
                     or case['issue_area'] in issue_area)
                and (not decision_direction
                     or case['decision_direction'] in decision_direction)
                and (not date_range
                     or date_range[0] <= case['decision_date'] <= date_range[1])
                and (not min_len
                     or not (len(case['text']) < min_len))
            )
            if not keep:
                continue

            yield case['text'] if text_only is True else case

            n_yielded += 1
            if n_yielded == limit:
                return
示例#8
0
    def _iterate(self, text_only, opinion_author=None, decision_direction=None,
                 issue_area=None, date_range=None, min_len=None, limit=-1):
        """Note: Use `.texts()` or `.records()` to iterate over corpus data.

        Generator over the records read from ``self.filepath``. Each filter
        that is given is validated, then applied; records passing all of
        them are yielded as their "text" value when ``text_only`` is True,
        else as whole records. Iteration stops once the number of yielded
        items equals ``limit``. Invalid filter values raise ``ValueError``.
        """
        # validate / normalize the filters
        if opinion_author:
            if isinstance(opinion_author, int):
                opinion_author = {opinion_author}
            if any(code not in self.opinion_author_codes
                   for code in opinion_author):
                raise ValueError(
                    'invalid `opinion_author` value; see `SupremeCourt.opinion_author_codes`')
        if issue_area:
            if isinstance(issue_area, int):
                issue_area = {issue_area}
            if any(code not in self.issue_area_codes for code in issue_area):
                raise ValueError(
                    'invalid `issue_area` value; see `SupremeCourt.issue_area_codes`')
        if decision_direction:
            if isinstance(decision_direction, string_types):
                decision_direction = {decision_direction}
            if any(direction not in self.decision_directions
                   for direction in decision_direction):
                raise ValueError(
                    'invalid `decision_direction` value; see `SupremeCourt.decision_directions`')
        if date_range:
            if not isinstance(date_range, (list, tuple)):
                raise ValueError(
                    '`date_range` must be a list or tuple, not {}'.format(type(date_range)))
            if len(date_range) != 2:
                raise ValueError(
                    '`date_range` must have both start and end values')
            # a falsy bound means "open-ended" on that side
            start, end = date_range
            date_range = (start or MIN_DATE, end or MAX_DATE)

        if is_python2:
            mode = 'rb'
        else:
            mode = 'rt'

        n_yielded = 0
        for case in read_json_lines(self.filepath, mode=mode):
            # a filter that wasn't given always passes; date bounds are inclusive
            keep = (
                (not opinion_author
                 or case['maj_opinion_author'] in opinion_author)
                and (not issue_area
                     or case['issue_area'] in issue_area)
                and (not decision_direction
                     or case['decision_direction'] in decision_direction)
                and (not date_range
                     or date_range[0] <= case['decision_date'] <= date_range[1])
                and (not min_len
                     or not (len(case['text']) < min_len))
            )
            if not keep:
                continue

            yield case['text'] if text_only is True else case

            n_yielded += 1
            if n_yielded == limit:
                return
示例#9
0
 def test_read_write_json_lines(self):
     # build one {'idx', 'sent'} record per sentence of the fixture document
     records = []
     for idx, sent in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sent.text})
     path = os.path.join(self.tempdir, 'test_read_write_json_lines.json')
     # write the records out, read them back, and expect an exact round trip
     fileio.write_json_lines(records, path)
     roundtripped = list(fileio.read_json_lines(path))
     self.assertEqual(roundtripped, records)
示例#10
0
def fetch_bernie_and_hillary(data_dir=None,
                             download_if_missing=True,
                             shuffle=False):
    """
    Read the Bernie & Hillary corpus into memory, downloading it first when
    it can't be read from disk and downloading is allowed.

    Args:
        data_dir (str): directory on disk expected to hold the corpus file;
            when None, ``DEFAULT_DATA_DIR`` is used (optional)
        download_if_missing (bool): when True and the corpus file can't be
            read from disk, it is downloaded into ``data_dir`` and read
            again (optional)
        shuffle (bool): when True, the loaded documents are put in random
            order; when False, they keep the order stored in the file,
            documented as chronological (optional)

    Returns:
        list[dict]: one item per speech document read from the corpus file

    Raises:
        IOError: if the file can't be read from disk and
            `download_if_missing` is not True
        HTTPError: if the file can't be read from disk, `download_if_missing`
            is True, and something goes wrong with the download
    """
    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    corpus_path = os.path.join(data_dir, FNAME)

    def _read_corpus():
        return list(read_json_lines(corpus_path, mode='rt', encoding=None))

    try:
        speeches = _read_corpus()
    except (OSError, IOError):
        if download_if_missing is not True:
            logger.exception('unable to load corpus from %s', corpus_path)
            raise
        # file wasn't readable: fetch it, then try exactly once more
        _download_bernie_and_hillary(data_dir=data_dir)
        speeches = _read_corpus()
    logger.info('loading corpus from %s', corpus_path)

    if shuffle is True:
        random.shuffle(speeches)
    return speeches
示例#11
0
def fetch_bernie_and_hillary(data_dir=None,
                             download_if_missing=True,
                             shuffle=False):
    """
    Read the Bernie & Hillary corpus into memory, downloading it first when
    it can't be read from disk and downloading is allowed.

    Args:
        data_dir (str): directory on disk expected to hold the corpus file;
            when None, ``DEFAULT_DATA_DIR`` is used (optional)
        download_if_missing (bool): when True and the corpus file can't be
            read from disk, it is downloaded into ``data_dir`` and read
            again (optional)
        shuffle (bool): when True, the loaded documents are put in random
            order; when False, they keep the order stored in the file,
            documented as chronological (optional)

    Returns:
        list[dict]: one item per speech document read from the corpus file

    Raises:
        IOError: if the file can't be read from disk and
            `download_if_missing` is not True
        HTTPError: if the file can't be read from disk, `download_if_missing`
            is True, and something goes wrong with the download
    """
    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    corpus_path = os.path.join(data_dir, FNAME)

    def _read_corpus():
        return list(read_json_lines(corpus_path, mode='rt', encoding=None))

    try:
        speeches = _read_corpus()
    except (OSError, IOError):
        if download_if_missing is not True:
            logger.exception('unable to load corpus from %s', corpus_path)
            raise
        # file wasn't readable: fetch it, then try exactly once more
        _download_bernie_and_hillary(data_dir=data_dir)
        speeches = _read_corpus()
    logger.info('loading corpus from %s', corpus_path)

    if shuffle is True:
        random.shuffle(speeches)
    return speeches
示例#12
0
    def load(cls, path, name=None, compression=None):
        """
        Rebuild a ``Corpus`` from the content and metadata files saved in ``path``.

        Args:
            path (str): Directory on disk holding the saved content + metadata.
            name (str): Identifying name, joined by '_' onto the default
                filenames 'info.json', 'metadatas.json', and 'spacy_docs.bin',
                as used when the corpus was saved via :meth:`Corpus.save()`.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                applied to the 'metadatas.json' file when saved, if any;
                selects the extension ('.gz', '.bz2', or '.xz') appended to
                its filename.

        Returns:
            :class:`textacy.Corpus <Corpus>`

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        def _saved_file(basename):
            # the optional name is glued onto the filename with an underscore
            if name:
                basename = '_'.join([name, basename])
            return os.path.join(path, basename)

        info_fname = _saved_file('info.json')
        meta_fname = _saved_file('metadatas.json')
        docs_fname = _saved_file('spacy_docs.bin')

        # only the metadata file carries a compression-specific extension
        if compression == 'gzip':
            meta_fname += '.gz'
        elif compression == 'bz2':
            meta_fname += '.bz2'
        elif compression == 'lzma':
            meta_fname += '.xz'

        if PY2 is False or compression is None:
            meta_mode = 'rt'
        else:
            meta_mode = 'rb'

        info_records = list(fileio.read_json(info_fname))
        package_info = info_records[0]
        lang = package_info['textacy_lang']
        saved_version = package_info['spacy_version']
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this Corpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Corpus may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        corpus = Corpus(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
        # docs and their metadata are paired up positionally
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            doc = Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)
            corpus.add_doc(doc)
        return corpus
示例#13
0
    def load(cls, path, fname_prefix=None, compression=None):
        """
        Rebuild a TextCorpus from the content and metadata files saved in ``path``.

        Args:
            path (str): directory on disk holding the saved content + metadata
            fname_prefix (str, optional): identifying prefix, joined by '_'
                onto the standard filenames 'info.json', 'metadatas.json',
                and 'spacy_docs.bin'
            compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
                applied to the metadatas json file; selects the extension
                ('.gz', '.bz2', or '.xz') appended to its filename

        Returns:
            :class:`textacy.TextCorpus`

        .. warning:: If the `spacy.Vocab` object used to save this corpus is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        def _saved_file(basename):
            # the optional prefix is glued onto the name with an underscore
            if fname_prefix:
                basename = '_'.join([fname_prefix, basename])
            return os.path.join(path, basename)

        info_fname = _saved_file('info.json')
        meta_fname = _saved_file('metadatas.json')
        docs_fname = _saved_file('spacy_docs.bin')

        # only the metadata file carries a compression-specific extension
        if compression == 'gzip':
            meta_fname += '.gz'
        elif compression == 'bz2':
            meta_fname += '.bz2'
        elif compression == 'lzma':
            meta_fname += '.xz'

        if PY2 is False or compression is None:
            meta_mode = 'rt'
        else:
            meta_mode = 'rb'

        info_records = list(fileio.read_json(info_fname))
        package_info = info_records[0]
        lang = package_info['textacy_lang']
        saved_version = package_info['spacy_version']
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        textcorpus = TextCorpus(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # docs and their metadata are paired up positionally
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            doc = TextDoc(
                spacy_doc,
                spacy_pipeline=textcorpus.spacy_pipeline,
                lang=lang,
                metadata=metadata)
            textcorpus.add_doc(doc)
        return textcorpus
示例#14
0
    def load(cls, path, name=None, compression=None):
        """
        Rebuild a ``Corpus`` from the content and metadata files saved in ``path``.

        Args:
            path (str): Directory on disk holding the saved content + metadata.
            name (str): Identifying name, joined by '_' onto the default
                filenames 'info.json', 'metadatas.json', and 'spacy_docs.bin',
                as used when the corpus was saved via :meth:`Corpus.save()`.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                applied to the 'metadatas.json' file when saved, if any;
                selects the extension ('.gz', '.bz2', or '.xz') appended to
                its filename.

        Returns:
            :class:`textacy.Corpus <Corpus>`

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        def _saved_file(basename):
            # the optional name is glued onto the filename with an underscore
            if name:
                basename = '_'.join([name, basename])
            return os.path.join(path, basename)

        info_fname = _saved_file('info.json')
        meta_fname = _saved_file('metadatas.json')
        docs_fname = _saved_file('spacy_docs.bin')

        # only the metadata file carries a compression-specific extension
        if compression == 'gzip':
            meta_fname += '.gz'
        elif compression == 'bz2':
            meta_fname += '.bz2'
        elif compression == 'lzma':
            meta_fname += '.xz'

        if PY2 is False or compression is None:
            meta_mode = 'rt'
        else:
            meta_mode = 'rb'

        info_records = list(fileio.read_json(info_fname))
        package_info = info_records[0]
        lang = package_info['textacy_lang']
        saved_version = package_info['spacy_version']
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this Corpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Corpus may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        corpus = Corpus(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
        # docs and their metadata are paired up positionally
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            doc = Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)
            corpus.add_doc(doc)
        return corpus
示例#15
0
 def test_read_write_json_lines_unicode(self):
     # build one {'idx', 'sent'} record per sentence of the fixture document
     records = []
     for idx, sent in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sent.text})
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         path = os.path.join(
             self.tempdir, 'test_read_write_json_lines_unicode' + ext)
         if PY2 is True and ext != '.json':
             # under PY2, opening a compressed file in text mode is
             # expected to be rejected with ValueError
             with self.assertRaises(ValueError):
                 fileio.open_sesame(path, 'wt', None, True)
             continue
         # otherwise, a text-mode write followed by a read must round-trip
         fileio.write_json_lines(records, path, mode='wt',
                                 auto_make_dirs=True)
         roundtripped = list(fileio.read_json_lines(path, mode='rt'))
         self.assertEqual(roundtripped, records)
示例#16
0
 def test_read_write_json_lines_unicode(self):
     # build one {'idx', 'sent'} record per sentence of the fixture document
     records = []
     for idx, sent in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sent.text})
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         path = os.path.join(
             self.tempdir, 'test_read_write_json_lines_unicode' + ext)
         if PY2 is True and ext != '.json':
             # under PY2, opening a compressed file in text mode is
             # expected to be rejected with ValueError
             with self.assertRaises(ValueError):
                 fileio.open_sesame(path, 'wt', None, True)
             continue
         # otherwise, a text-mode write followed by a read must round-trip
         fileio.write_json_lines(records, path, mode='wt',
                                 auto_make_dirs=True)
         roundtripped = list(fileio.read_json_lines(path, mode='rt'))
         self.assertEqual(roundtripped, records)
示例#17
0
文件: texts.py 项目: licyeus/textacy
    def load(cls, path, fname_prefix=None):
        """
        Read a saved corpus (package info, per-document metadata, and
        serialized spaCy docs) from disk and rebuild it as a TextCorpus.

        Args:
            path (str): directory on disk where the three files are stored
            fname_prefix (str, optional): if given, it is joined with '_' in
                front of each standard filename ('info.json', 'metadatas.json',
                and 'spacy_docs.bin') before looking in ``path``

        Returns:
            :class:`textacy.TextCorpus`

        Warns:
            UserWarning: if the spaCy version recorded in the info file is
                not equal to ``spacy.about.__version__``
        """
        basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
        if fname_prefix:
            basenames = ['_'.join([fname_prefix, name]) for name in basenames]
        info_fname, meta_fname, docs_fname = [
            os.path.join(path, name) for name in basenames]

        # the first item read from the info file holds the package details
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        saved_version = package_info['spacy_version']
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        textcorpus = TextCorpus(lang)
        metadata_stream = fileio.read_json_lines(meta_fname)
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # docs and metadata are paired up by position in their streams
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            doc = TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                          lang=lang, metadata=metadata)
            textcorpus.add_doc(doc)
        return textcorpus
示例#18
0
 def test_read_write_json_lines_bytes(self):
     """
     Exercise JSON-lines writing and reading in bytes mode for each file
     extension tried.

     When ``is_python2`` is True: '.json.xz' must make ``fileio.open_sesame``
     raise a ValueError, and every other extension must round-trip the
     records unchanged. Otherwise, writing in 'wb' mode must raise a
     TypeError for every extension.
     """
     records = []
     for idx, sentence in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sentence.text})
     basename = 'test_read_write_json_lines_bytes'
     for suffix in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         target = os.path.join(self.tempdir, basename + suffix)
         if is_python2 is not True:
             self.assertRaises(
                 TypeError, fileio.write_json_lines,
                 records, target, 'wb', None, True)
             continue
         if suffix == '.json.xz':
             self.assertRaises(
                 ValueError, fileio.open_sesame,
                 target, 'wb', 'utf-8', True)
             continue
         fileio.write_json_lines(
             records, target, mode='wb', auto_make_dirs=True)
         roundtripped = list(fileio.read_json_lines(target, mode='rb'))
         self.assertEqual(roundtripped, records)
示例#19
0
    def _iterate(self,
                 text_only,
                 opinion_author=None,
                 decision_direction=None,
                 issue_area=None,
                 date_range=None,
                 min_len=None,
                 limit=-1):
        """
        Stream the records stored in this dataset's file, skipping any record
        that fails one of the given filters. Used by
        :meth:`SupremeCourt.texts()` and :meth:`SupremeCourt.records()`.

        Args:
            text_only (bool): if True, yield only each record's 'text' value;
                otherwise, yield the whole record
            opinion_author (int or iterable): allowed 'maj_opinion_author'
                values; each must be in ``self.opinion_author_codes``
            decision_direction (str or iterable): allowed 'decision_direction'
                values; each must be in ``self.decision_directions``
            issue_area (int or iterable): allowed 'issue_area' values; each
                must be in ``self.issue_area_codes``
            date_range: passed through ``self._parse_date_range()``; records
                whose 'decision_date' falls outside the resulting pair of
                bounds (inclusive) are skipped
            min_len (int): records whose text is shorter than this are skipped
            limit (int): stop once this many items have been yielded

        Raises:
            IOError: if ``self.filename`` is falsy
            ValueError: if a filter contains a value that isn't recognized
        """
        if not self.filename:
            raise IOError('{} file not found'.format(self._filename))

        # wrap single filter values in a set, then check every value is known
        if opinion_author:
            if isinstance(opinion_author, int):
                opinion_author = {opinion_author}
            if any(code not in self.opinion_author_codes
                   for code in opinion_author):
                raise ValueError(
                    'invalid `opinion_author` value; see `SupremeCourt.opinion_author_codes`')
        if issue_area:
            if isinstance(issue_area, int):
                issue_area = {issue_area}
            if any(code not in self.issue_area_codes for code in issue_area):
                raise ValueError(
                    'invalid `issue_area` value; see `SupremeCourt.issue_area_codes`')
        if decision_direction:
            if isinstance(decision_direction, compat.string_types):
                decision_direction = {decision_direction}
            if any(direction not in self.decision_directions
                   for direction in decision_direction):
                raise ValueError(
                    'invalid `decision_direction` value; see `SupremeCourt.decision_directions`')
        if date_range:
            date_range = self._parse_date_range(date_range)

        read_mode = 'rb' if compat.is_python2 else 'rt'
        n_yielded = 0
        for record in fileio.read_json_lines(self.filename, mode=read_mode):
            # filters are evaluated in order; the first one that fails wins
            rejected = (
                (opinion_author
                 and record['maj_opinion_author'] not in opinion_author)
                or (issue_area and record['issue_area'] not in issue_area)
                or (decision_direction
                    and record['decision_direction'] not in decision_direction)
                or (date_range
                    and not (date_range[0]
                             <= record['decision_date']
                             <= date_range[1]))
                or (min_len and len(record['text']) < min_len)
            )
            if rejected:
                continue

            yield record['text'] if text_only is True else record

            n_yielded += 1
            if n_yielded == limit:
                break
示例#20
0
    def _iterate(self, text_only, speaker_name=None, speaker_party=None,
                 chamber=None, congress=None, date_range=None, min_len=None,
                 limit=-1):
        """
        Iterate over the records in the corpus file, skipping any record that
        fails one of the given filters.

        Note: Use `.texts()` or `.records()` to iterate over corpus data.

        Args:
            text_only (bool): if True, yield only each record's 'text' value;
                otherwise, yield the whole record
            speaker_name (str or iterable): allowed 'speaker_name' values; each
                must be in ``self.speaker_names``
            speaker_party (str or iterable): allowed 'speaker_party' values;
                each must be in ``self.speaker_parties``
            chamber (str or iterable): allowed 'chamber' values; each must be
                in ``self.chambers``
            congress (int or iterable): allowed 'congress' values; each must be
                in ``self.congresses``
            date_range (list or tuple): (start, end) bounds, inclusive, on a
                record's 'date'; a falsy start is replaced by ``MIN_DATE`` and
                a falsy end by ``MAX_DATE``
            min_len (int): records whose text is shorter than this are skipped
            limit (int): stop once this many items have been yielded; the
                default of -1 is never reached, so iteration is unlimited

        Raises:
            ValueError: if a filter contains a value that isn't recognized, or
                if ``date_range`` is not a list or tuple of exactly two items
        """
        # prepare filters
        if speaker_name:
            if isinstance(speaker_name, string_types):
                speaker_name = {speaker_name}
            if not all(item in self.speaker_names for item in speaker_name):
                raise ValueError(
                    'all values in `speaker_name` must be valid; see `CapitolWords.speaker_names`')
        if speaker_party:
            if isinstance(speaker_party, string_types):
                speaker_party = {speaker_party}
            if not all(item in self.speaker_parties for item in speaker_party):
                raise ValueError(
                    'all values in `speaker_party` must be valid; see `CapitolWords.speaker_parties`')
        if chamber:
            if isinstance(chamber, string_types):
                chamber = {chamber}
            if not all(item in self.chambers for item in chamber):
                raise ValueError(
                    'all values in `chamber` must be valid; see `CapitolWords.chambers`')
        if congress:
            if isinstance(congress, int):
                congress = {congress}
            if not all(item in self.congresses for item in congress):
                raise ValueError(
                    'all values in `congress` must be valid; see `CapitolWords.congresses`')
        if date_range:
            if not isinstance(date_range, (list, tuple)):
                # exceptions don't interpolate %-style args the way logging
                # calls do, so the message must be formatted explicitly
                raise ValueError(
                    '`date_range` must be a list or tuple, not {}'.format(
                        type(date_range)))
            if not len(date_range) == 2:
                raise ValueError('`date_range` must have both start and end values')
            if not date_range[0]:
                date_range = (MIN_DATE, date_range[1])
            if not date_range[1]:
                date_range = (date_range[0], MAX_DATE)

        n = 0
        mode = 'rb' if is_python2 else 'rt'
        for line in read_json_lines(self.filepath, mode=mode):
            if speaker_name and line['speaker_name'] not in speaker_name:
                continue
            if speaker_party and line['speaker_party'] not in speaker_party:
                continue
            if chamber and line['chamber'] not in chamber:
                continue
            if congress and line['congress'] not in congress:
                continue
            if date_range and not date_range[0] <= line['date'] <= date_range[1]:
                continue
            if min_len and len(line['text']) < min_len:
                continue

            if text_only is True:
                yield line['text']
            else:
                yield line

            # only records that were actually yielded count toward the limit
            n += 1
            if n == limit:
                break
示例#21
0
    def _iterate(self, text_only, subreddit, date_range, score_range, min_len,
                 limit):
        """
        Iterate over the comments stored in each file in ``self.paths``,
        skipping any comment that fails one of the given filters.

        Note: Use `.texts()` or `.records()` to iterate over corpus data.

        Args:
            text_only (bool): if True, yield only each comment's cleaned
                'body'; otherwise, yield the whole record, with its
                'created_utc' and 'retrieved_on' values converted by
                ``self._convert_timestamp()``
            subreddit (str or iterable): allowed 'subreddit' values
            date_range (list or tuple): (start, end) bounds, inclusive, on a
                comment's converted 'created_utc'; a falsy start is replaced
                by ``MIN_DATE`` and a falsy end by ``MAX_DATE``
            score_range (list or tuple): (min, max) bounds, inclusive, on a
                comment's 'score'; a bound of None is replaced by ``MIN_INT``
                or ``MAX_INT``, respectively
            min_len (int): comments whose cleaned body is shorter than this
                are skipped
            limit (int): stop once this many items have been yielded, counted
                across all files

        Raises:
            ValueError: if ``date_range`` or ``score_range`` is not a list or
                tuple of exactly two items
        """
        if subreddit:
            if isinstance(subreddit, string_types):
                subreddit = {subreddit}
            elif isinstance(subreddit, (list, tuple)):
                subreddit = set(subreddit)
        if date_range:
            if not isinstance(date_range, (list, tuple)):
                msg = '`date_range` must be a list or tuple, not {}'.format(
                    type(date_range))
                raise ValueError(msg)
            if not len(date_range) == 2:
                msg = '`date_range` must have both start and end values'
                raise ValueError(msg)
            if not date_range[0]:
                date_range = (MIN_DATE, date_range[1])
            if not date_range[1]:
                date_range = (date_range[0], MAX_DATE)
        if score_range:
            if not isinstance(score_range, (list, tuple)):
                msg = '`score_range` must be a list or tuple, not {}'.format(
                    type(score_range))
                raise ValueError(msg)
            if len(score_range) != 2:
                msg = '`score_range` must have both min and max values'
                raise ValueError(msg)
            # 0 is a legitimate score bound, so only None means "unbounded";
            # a plain truthiness check would turn (0, 10) into (MIN_INT, 10)
            if score_range[0] is None:
                score_range = (MIN_INT, score_range[1])
            if score_range[1] is None:
                score_range = (score_range[0], MAX_INT)

        n = 0
        mode = 'rb' if PY2 else 'rt'  # Python 2 can't open json in text mode
        for path in self.paths:
            for line in read_json_lines(path, mode=mode):

                if subreddit and line['subreddit'] not in subreddit:
                    continue
                if score_range and not score_range[0] <= line[
                        'score'] <= score_range[1]:
                    continue
                # the timestamp is converted before the date filter compares it
                line['created_utc'] = self._convert_timestamp(
                    line.get('created_utc', ''))
                if date_range and not date_range[0] <= line[
                        'created_utc'] <= date_range[1]:
                    continue
                # the body is cleaned before its length is checked
                line['body'] = self._clean_content(line['body'])
                if min_len and len(line['body']) < min_len:
                    continue

                if text_only is True:
                    yield line['body']
                else:
                    line['retrieved_on'] = self._convert_timestamp(
                        line.get('retrieved_on', ''))
                    yield line

                n += 1
                if n == limit:
                    break

            # the inner break only leaves one file; stop the outer loop too
            if n == limit:
                break
示例#22
0
    def _iterate(self, text_only, speaker_name, speaker_party, chamber,
                 congress, date_range, min_len, limit):
        """
        Stream the records stored in this dataset's file, skipping any record
        that fails one of the given filters. Used by
        :meth:`CapitolWords.texts()` and :meth:`CapitolWords.records()`.

        Args:
            text_only (bool): if True, yield only each record's 'text' value;
                otherwise, yield the whole record
            speaker_name (str or iterable): allowed 'speaker_name' values; each
                must be in ``self.speaker_names``
            speaker_party (str or iterable): allowed 'speaker_party' values;
                each must be in ``self.speaker_parties``
            chamber (str or iterable): allowed 'chamber' values; each must be
                in ``self.chambers``
            congress (int or iterable): allowed 'congress' values; each must be
                in ``self.congresses``
            date_range: passed through ``self._parse_date_range()``; records
                whose 'date' falls outside the resulting pair of bounds
                (inclusive) are skipped
            min_len (int): records whose text is shorter than this are skipped
            limit (int): stop once this many items have been yielded

        Raises:
            IOError: if ``self.filename`` is falsy
            ValueError: if a filter contains a value that isn't recognized
        """
        if not self.filename:
            raise IOError('{} file not found'.format(self._filename))

        # wrap single filter values in a set, then check every value is known
        if speaker_name:
            if isinstance(speaker_name, compat.string_types):
                speaker_name = {speaker_name}
            if any(name not in self.speaker_names for name in speaker_name):
                raise ValueError('all values in `speaker_name` must be valid; '
                                 'see :attr:`CapitolWords.speaker_names`')
        if speaker_party:
            if isinstance(speaker_party, compat.string_types):
                speaker_party = {speaker_party}
            if any(party not in self.speaker_parties
                   for party in speaker_party):
                raise ValueError(
                    'all values in `speaker_party` must be valid; '
                    'see :attr:`CapitolWords.speaker_parties`')
        if chamber:
            if isinstance(chamber, compat.string_types):
                chamber = {chamber}
            if any(house not in self.chambers for house in chamber):
                raise ValueError('all values in `chamber` must be valid; '
                                 'see :attr:`CapitolWords.chambers`')
        if congress:
            if isinstance(congress, int):
                congress = {congress}
            if any(number not in self.congresses for number in congress):
                raise ValueError('all values in `congress` must be valid; '
                                 'see :attr:`CapitolWords.congresses`')
        if date_range:
            date_range = self._parse_date_range(date_range)

        read_mode = 'rb' if compat.is_python2 else 'rt'  # TODO: check this
        n_yielded = 0
        for record in fileio.read_json_lines(self.filename, mode=read_mode):
            # filters are evaluated in order; the first one that fails wins
            rejected = (
                (speaker_name and record['speaker_name'] not in speaker_name)
                or (speaker_party
                    and record['speaker_party'] not in speaker_party)
                or (chamber and record['chamber'] not in chamber)
                or (congress and record['congress'] not in congress)
                or (date_range
                    and not (date_range[0] <= record['date'] <= date_range[1]))
                or (min_len and len(record['text']) < min_len)
            )
            if rejected:
                continue

            yield record['text'] if text_only is True else record

            n_yielded += 1
            if n_yielded == limit:
                break
示例#23
0
    def _iterate(
        self,
        text_only,
        speaker_name=None,
        speaker_party=None,
        chamber=None,
        congress=None,
        date_range=None,
        min_len=None,
        limit=-1,
    ):
        """
        Iterate over the records in the corpus file, skipping any record that
        fails one of the given filters.

        Note: Use `.texts()` or `.records()` to iterate over corpus data.

        Args:
            text_only (bool): if True, yield only each record's "text" value;
                otherwise, yield the whole record
            speaker_name (str or iterable): allowed "speaker_name" values; each
                must be in ``self.speaker_names``
            speaker_party (str or iterable): allowed "speaker_party" values;
                each must be in ``self.speaker_parties``
            chamber (str or iterable): allowed "chamber" values; each must be
                in ``self.chambers``
            congress (int or iterable): allowed "congress" values; each must be
                in ``self.congresses``
            date_range (list or tuple): (start, end) bounds, inclusive, on a
                record's "date"; a falsy start is replaced by ``MIN_DATE`` and
                a falsy end by ``MAX_DATE``
            min_len (int): records whose text is shorter than this are skipped
            limit (int): stop once this many items have been yielded; the
                default of -1 is never reached, so iteration is unlimited

        Raises:
            ValueError: if a filter contains a value that isn't recognized, or
                if ``date_range`` is not a list or tuple of exactly two items
        """
        # prepare filters
        if speaker_name:
            if isinstance(speaker_name, string_types):
                speaker_name = {speaker_name}
            if not all(item in self.speaker_names for item in speaker_name):
                raise ValueError("all values in `speaker_name` must be valid; see `CapitolWords.speaker_names`")
        if speaker_party:
            if isinstance(speaker_party, string_types):
                speaker_party = {speaker_party}
            if not all(item in self.speaker_parties for item in speaker_party):
                raise ValueError("all values in `speaker_party` must be valid; see `CapitolWords.speaker_parties`")
        if chamber:
            if isinstance(chamber, string_types):
                chamber = {chamber}
            if not all(item in self.chambers for item in chamber):
                raise ValueError("all values in `chamber` must be valid; see `CapitolWords.chambers`")
        if congress:
            if isinstance(congress, int):
                congress = {congress}
            if not all(item in self.congresses for item in congress):
                raise ValueError("all values in `congress` must be valid; see `CapitolWords.congresses`")
        if date_range:
            if not isinstance(date_range, (list, tuple)):
                # exceptions don't interpolate %-style args the way logging
                # calls do, so the message must be formatted explicitly
                raise ValueError(
                    "`date_range` must be a list or tuple, not {}".format(type(date_range))
                )
            if not len(date_range) == 2:
                raise ValueError("`date_range` must have both start and end values")
            if not date_range[0]:
                date_range = (MIN_DATE, date_range[1])
            if not date_range[1]:
                date_range = (date_range[0], MAX_DATE)

        n = 0
        mode = "rb" if PY2 else "rt"
        for line in read_json_lines(self.filepath, mode=mode):
            if speaker_name and line["speaker_name"] not in speaker_name:
                continue
            if speaker_party and line["speaker_party"] not in speaker_party:
                continue
            if chamber and line["chamber"] not in chamber:
                continue
            if congress and line["congress"] not in congress:
                continue
            if date_range and not date_range[0] <= line["date"] <= date_range[1]:
                continue
            if min_len and len(line["text"]) < min_len:
                continue

            if text_only is True:
                yield line["text"]
            else:
                yield line

            # only records that were actually yielded count toward the limit
            n += 1
            if n == limit:
                break