Пример #1
0
    def log_artifact(self, local_file, artifact_path=None):
        """Upload a single local file to the project's datasets store.

        Parameters
        ----------
        local_file : str
            Path of the file on the local filesystem; its basename is
            kept as the remote filename.
        artifact_path : str, optional
            POSIX-style remote directory to place the file under.
            Defaults to the artifact root ("./") when omitted.
        """
        # Fall back to the artifact root when no directory was supplied.
        target_dir = "./" if artifact_path is None else artifact_path
        filename = os.path.basename(local_file)
        destination = posixpath.join(target_dir, filename)
        # Resolve the datasets-relative path and upload the file.
        datasets.put(local_file, self._datasets_path(destination), self.project_id)
Пример #2
0
def test_put_directory(mocker, mock_client):
    """Putting a local directory should create the remote directory tree
    and upload each file it contains via ``_put_file``."""
    # The implementation resolves the parent of the destination path.
    dirname_mock = mocker.patch("posixpath.dirname", return_value="/")
    # First isdir call: the source is a directory; second: its sole
    # entry is a plain file.
    isdir_mock = mocker.patch("os.path.isdir", side_effect=[True, False])
    listdir_mock = mocker.patch("os.listdir", return_value=["test-file"])
    put_file_mock = mocker.patch("faculty.datasets._put_file")

    datasets.put("local-path", "project-path", PROJECT_ID)

    dirname_mock.assert_called_once_with("project-path")
    # Parent is created with parents=True, then the directory itself.
    mock_client.create_directory.assert_has_calls([
        mocker.call(PROJECT_ID, "/", parents=True),
        mocker.call(PROJECT_ID, "project-path"),
    ])
    isdir_mock.assert_has_calls([
        mocker.call("local-path"),
        mocker.call("local-path/test-file"),
    ])
    listdir_mock.assert_called_once_with("local-path")
    put_file_mock.assert_called_once_with(
        "local-path/test-file",
        "project-path/test-file",
        PROJECT_ID,
        mock_client,
    )
Пример #3
0
def fetch_and_save_articles_in_date_range(from_date, to_date):
    """Fetch raw articles in [from_date, to_date], build Article objects,
    and archive their content as a tab-separated file in datasets.

    Parameters
    ----------
    from_date, to_date : datetime.date
        Range passed (as '%Y-%m-%d' strings) to ``RawArticle.get_raw_articles``;
        ``from_date`` also names the uploaded file.
    """
    # Fetch raw articles for the requested window.
    raw_articles = RawArticle.get_raw_articles(from_date.strftime('%Y-%m-%d'),
                                               to_date.strftime('%Y-%m-%d'))

    # Build articles (database insertion is currently disabled).
    articles = Article.build_articles(raw_articles)
    # Article.bulk_insert(articles)

    # Store raw article content in datasets for later analysis.  Articles
    # lacking a title or description are dropped, and duplicate URLs are
    # collapsed to a single row.
    df = pd.DataFrame.from_records([
        {
            'article_title': x.title,
            'article_uuid': x.article_uuid,
            'article_url': x.url,
            'article_description': x.description,
            'source_id': x.source_id,
            'published_at': x.published_at,
            'named_entities': x.named_entities,
            'raw_content': x.raw_content,
        } for x in articles
        if x.title is not None and x.description is not None
    ]).drop_duplicates(subset='article_url').reset_index(drop=True)

    # FIX: the old code wrapped to_csv in an unused `open(tmp.name, 'w')`
    # context manager; to_csv writes to the path itself, so the redundant
    # handle is removed.  `tmp` is kept in scope so the temporary file
    # survives until the upload finishes.
    tmp = tempfile.NamedTemporaryFile()
    df.to_csv(tmp.name, sep='\t', encoding='utf-8', index=False)
    date_str = from_date.strftime('%Y-%m-%d')
    datasets.put(tmp.name, f'/input/article_content/{date_str}.csv')
Пример #4
0
def test_put_file(mocker, mock_client):
    """Putting a regular file should create the remote parent directory
    and upload the file in a single transfer."""
    dirname_mock = mocker.patch("posixpath.dirname", return_value="/")
    # The source path is not a directory, so the single-file branch runs.
    isdir_mock = mocker.patch("os.path.isdir", return_value=False)
    upload_mock = mocker.patch("faculty.datasets.transfer.upload_file")

    datasets.put("local-path", "project-path", PROJECT_ID)

    dirname_mock.assert_called_once_with("project-path")
    mock_client.create_directory.assert_called_once_with(
        PROJECT_ID, "/", parents=True
    )
    isdir_mock.assert_called_once_with("local-path")
    upload_mock.assert_called_once_with(
        mock_client, PROJECT_ID, "project-path", "local-path"
    )
Пример #5
0
def save_data_frame(df):
    """Write *df* to the local TMP_CSV path as a tab-separated file and
    upload it to the TODAY_CSV path in datasets.
    """
    # Drop any leftover file from a previous run before rewriting it.
    remove_file(TMP_CSV)
    df.to_csv(TMP_CSV, sep='\t', encoding='utf-8', index=False)
    datasets.put(TMP_CSV, TODAY_CSV)
Пример #6
0
 def log_artifacts(self, local_dir, artifact_path=None):
     """Upload the contents of *local_dir* to the project's datasets store.

     Parameters
     ----------
     local_dir : str
         Local directory whose contents are uploaded.
     artifact_path : str, optional
         Remote directory to upload into; defaults to the root ("./").
     """
     if artifact_path is None:
         artifact_path = "./"
     # Map the artifact directory onto its datasets-relative path.
     datasets_path = self._datasets_path(artifact_path)
     datasets.put(local_dir, datasets_path, self.project_id)
Пример #7
0
# Fetch yesterday's articles and archive their content in datasets.
# FIX: compute "today" once — calling datetime.today() twice could yield
# different dates if the script runs across midnight.
today = datetime.today().date()
from_date = today - timedelta(days=1)
to_date = today

# Fetch raw articles
raw_articles = RawArticle.get_raw_articles(from_date.strftime('%Y-%m-%d'),
                                           to_date.strftime('%Y-%m-%d'))

# Build articles (database insertion is currently disabled)
articles = Article.build_articles(raw_articles)
# Article.bulk_insert(articles)

# Store raw article content in datasets for later analysis.  Articles
# lacking a title or description are dropped, and duplicate URLs are
# collapsed to a single row.
df = pd.DataFrame.from_records([
    {
        'article_title': x.title,
        'article_uuid': x.article_uuid,
        'article_url': x.url,
        'article_description': x.description,
        'source_id': x.source_id,
        'published_at': x.published_at,
        'named_entities': x.named_entities,
        'raw_content': x.raw_content,
    } for x in articles if x.title is not None and x.description is not None
]).drop_duplicates(subset='article_url').reset_index(drop=True)

# FIX: the old code wrapped to_csv in an unused `open(tmp.name, 'w')`
# context manager; to_csv writes to the path itself, so the redundant
# handle is removed.  `tmp` stays in scope so the temporary file
# survives until the upload finishes.
tmp = tempfile.NamedTemporaryFile()
df.to_csv(tmp.name, sep='\t', encoding='utf-8', index=False)
date_str = from_date.strftime('%Y-%m-%d')
datasets.put(tmp.name, f'/input/article_content/{date_str}.csv')
Пример #8
0
def save_dataframe_to_datasets(df, parsed_date):
    """Write *df* to a temporary tab-separated file and upload it to
    ``/input/article_content/<YYYY-MM-DD>.csv`` in datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialize (UTF-8, tab-separated, no index column).
    parsed_date : datetime.date or datetime.datetime
        Date used to name the uploaded file.
    """
    # FIX: the old code wrapped to_csv in an unused `open(tmp.name, 'w')`
    # context manager; to_csv writes to the path itself, so the redundant
    # handle is removed.  `tmp` stays in scope so the temporary file
    # survives until the upload finishes.
    tmp = tempfile.NamedTemporaryFile()
    df.to_csv(tmp.name, sep='\t', encoding='utf-8', index=False)
    date_str = parsed_date.strftime('%Y-%m-%d')
    datasets.put(tmp.name, f'/input/article_content/{date_str}.csv')