def log_artifact(self, local_file, artifact_path=None):
    if artifact_path is None:
        artifact_path = "./"
    dest_path = posixpath.join(artifact_path, os.path.basename(local_file))
    datasets_path = self._datasets_path(dest_path)
    datasets.put(local_file, datasets_path, self.project_id)
def test_put_directory(mocker, mock_client):
    posixpath_dirname_mock = mocker.patch("posixpath.dirname", return_value="/")
    os_path_isdir_mock = mocker.patch("os.path.isdir", side_effect=[True, False])
    entry_mock = "test-file"
    os_listdir_mock = mocker.patch("os.listdir", return_value=[entry_mock])
    _put_file_mock = mocker.patch("faculty.datasets._put_file")

    datasets.put("local-path", "project-path", PROJECT_ID)

    posixpath_dirname_mock.assert_called_once_with("project-path")
    mock_client.create_directory.assert_has_calls(
        [
            mocker.call(PROJECT_ID, "/", parents=True),
            mocker.call(PROJECT_ID, "project-path"),
        ]
    )
    os_path_isdir_mock.assert_has_calls(
        [mocker.call("local-path"), mocker.call("local-path/test-file")]
    )
    os_listdir_mock.assert_called_once_with("local-path")
    _put_file_mock.assert_called_once_with(
        "local-path/test-file",
        "project-path/test-file",
        PROJECT_ID,
        mock_client,
    )
def fetch_and_save_articles_in_date_range(from_date, to_date):
    # Fetch raw articles
    raw_articles = RawArticle.get_raw_articles(
        from_date.strftime('%Y-%m-%d'), to_date.strftime('%Y-%m-%d')
    )

    # Build articles and insert into database
    articles = Article.build_articles(raw_articles)
    # Article.bulk_insert(articles)

    # Store raw article content in datasets for later analysis
    df = pd.DataFrame.from_records([
        {
            'article_title': x.title,
            'article_uuid': x.article_uuid,
            'article_url': x.url,
            'article_description': x.description,
            'source_id': x.source_id,
            'published_at': x.published_at,
            'named_entities': x.named_entities,
            'raw_content': x.raw_content,
        }
        for x in articles
        if x.title is not None and x.description is not None
    ]).drop_duplicates(subset='article_url').reset_index(drop=True)

    # Write the frame to a temporary file, then upload it to the project datasets
    # (to_csv writes the file itself, so no separate open() is needed)
    tmp = tempfile.NamedTemporaryFile()
    df.to_csv(tmp.name, sep='\t', encoding='utf-8', index=False)

    date_str = from_date.strftime('%Y-%m-%d')
    datasets.put(tmp.name, f'/input/article_content/{date_str}.csv')
def test_put_file(mocker, mock_client):
    posixpath_dirname_mock = mocker.patch("posixpath.dirname", return_value="/")
    os_path_isdir_mock = mocker.patch("os.path.isdir", return_value=False)
    upload_mock = mocker.patch("faculty.datasets.transfer.upload_file")

    datasets.put("local-path", "project-path", PROJECT_ID)

    posixpath_dirname_mock.assert_called_once_with("project-path")
    mock_client.create_directory.assert_called_once_with(
        PROJECT_ID, "/", parents=True
    )
    os_path_isdir_mock.assert_called_once_with("local-path")
    upload_mock.assert_called_once_with(
        mock_client, PROJECT_ID, "project-path", "local-path"
    )
def save_data_frame(df):
    remove_file(TMP_CSV)
    df.to_csv(TMP_CSV, sep='\t', encoding='utf-8', index=False)
    datasets.put(TMP_CSV, TODAY_CSV)
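# A minimal sketch of the module-level names save_data_frame relies on. Their
# real definitions are not shown in the source; TMP_CSV, TODAY_CSV and
# remove_file below are illustrative assumptions, not the original values.
import os
from datetime import date

from faculty import datasets

TMP_CSV = '/tmp/today.csv'                             # assumed local scratch path
TODAY_CSV = f'/output/{date.today().isoformat()}.csv'  # assumed datasets destination


def remove_file(path):
    # Delete the scratch file if a previous run left it behind.
    if os.path.exists(path):
        os.remove(path)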
def log_artifacts(self, local_dir, artifact_path=None):
    if artifact_path is None:
        artifact_path = "./"
    datasets_path = self._datasets_path(artifact_path)
    datasets.put(local_dir, datasets_path, self.project_id)
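# Both artifact-logging methods above delegate to self._datasets_path to map an
# MLflow-style artifact path onto a datasets path. A minimal sketch of what such
# a helper could look like, assuming the repository stores a root prefix in
# self.datasets_artifact_root (an assumed attribute name, not taken from the source):
import posixpath


def _datasets_path(self, dest_path):
    # Join the assumed datasets root with the artifact-relative path, collapsing
    # the leading "./" that log_artifact / log_artifacts pass in by default.
    return posixpath.normpath(
        posixpath.join(self.datasets_artifact_root, dest_path)
    )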
from_date = datetime.today().date() - timedelta(days=1)
to_date = datetime.today().date()

# Fetch raw articles
raw_articles = RawArticle.get_raw_articles(
    from_date.strftime('%Y-%m-%d'), to_date.strftime('%Y-%m-%d')
)

# Build articles and insert into database
articles = Article.build_articles(raw_articles)
# Article.bulk_insert(articles)

# Store raw article content in datasets for later analysis
df = pd.DataFrame.from_records([
    {
        'article_title': x.title,
        'article_uuid': x.article_uuid,
        'article_url': x.url,
        'article_description': x.description,
        'source_id': x.source_id,
        'published_at': x.published_at,
        'named_entities': x.named_entities,
        'raw_content': x.raw_content,
    }
    for x in articles
    if x.title is not None and x.description is not None
]).drop_duplicates(subset='article_url').reset_index(drop=True)

# Write the frame to a temporary file, then upload it to the project datasets
# (to_csv writes the file itself, so no separate open() is needed)
tmp = tempfile.NamedTemporaryFile()
df.to_csv(tmp.name, sep='\t', encoding='utf-8', index=False)

date_str = from_date.strftime('%Y-%m-%d')
datasets.put(tmp.name, f'/input/article_content/{date_str}.csv')
def save_dataframe_to_datasets(df, parsed_date):
    # Write the frame to a temporary file, then upload it to the project datasets
    # (to_csv writes the file itself, so no separate open() is needed)
    tmp = tempfile.NamedTemporaryFile()
    df.to_csv(tmp.name, sep='\t', encoding='utf-8', index=False)
    date_str = parsed_date.strftime('%Y-%m-%d')
    datasets.put(tmp.name, f'/input/article_content/{date_str}.csv')
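# Reading one of these uploads back for analysis. This is a sketch assuming the
# faculty.datasets.get(project_path, local_path) download helper; note the files
# are written tab-separated despite the .csv extension, so sep='\t' is needed on read.
import pandas as pd
from faculty import datasets


def load_article_content(date_str, local_path='/tmp/article_content.tsv'):
    # Download the stored file from the project datasets, then parse it as TSV.
    datasets.get(f'/input/article_content/{date_str}.csv', local_path)
    return pd.read_csv(local_path, sep='\t', encoding='utf-8')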