def test_read_all_single_file(self):
    file_name, expected_data = write_data(5)
    assert len(expected_data) == 5
    with TestPipeline() as pipeline:
        pcoll = (pipeline
                 | 'Create' >> Create([file_name])
                 | 'ReadAll' >> ReadAllFromText())
        assert_that(pcoll, equal_to(expected_data))
import argparse
import json

import apache_beam as beam
from apache_beam.io import ReadAllFromText, WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions


def run(argv=None):
    # Argument parser; unrecognized arguments are passed through to Beam as
    # pipeline options (google_cloud_options, runner, etc.).
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    p = beam.Pipeline(options=pipeline_options)

    # Each Pub/Sub message is a GCS object notification; build the file path
    # from it and read the file's contents with ReadAllFromText.
    # ReadFromPubSub requires a streaming pipeline (pass --streaming).
    p1 = (p
          | 'trigger from pubsub' >> beam.io.ReadFromPubSub(
              topic='projects/PROJECT_ID/topics/TOPIC_NAME_1')
          | "convert msg to dict" >> beam.Map(lambda x: json.loads(x))
          | "extract filename" >> beam.Map(
              lambda x: 'gs://{}/{}'.format(x['bucket'], x['name']))
          | "read file" >> ReadAllFromText()
          | 'split' >> beam.Map(lambda x: x.split(','))
          | 'format to dict' >> beam.Map(lambda x: {"id": x[0], "name": x[1]}))

    # Write the records to BigQuery.
    output_rec = p1 | 'write to BigQuery' >> WriteToBigQuery(
        'PROJECT_ID:DATASET_ID.TABLE_NAME',
        schema='id:INTEGER, name:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
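# A minimal sketch of an entry point for launching run(); the logging
# configuration is an assumption, not taken from the original pipeline.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    run()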
def test_read_all_file_pattern(self):
    pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
    assert len(expected_data) == 40
    with TestPipeline() as pipeline:
        pcoll = (pipeline
                 | 'Create' >> Create([pattern])
                 | 'ReadAll' >> ReadAllFromText())
        assert_that(pcoll, equal_to(expected_data))
def test_read_all_with_filename(self):
    pattern, expected_data = write_pattern([5, 3], return_filenames=True)
    assert len(expected_data) == 8
    with TestPipeline() as pipeline:
        pcoll = (pipeline
                 | 'Create' >> Create([pattern])
                 | 'ReadAll' >> ReadAllFromText(with_filename=True))
        assert_that(pcoll, equal_to(expected_data))
def test_read_all_gzip(self):
    _, lines = write_data(100)
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file()
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))
        with TestPipeline() as pipeline:
            pcoll = (pipeline
                     | Create([file_name])
                     | 'ReadAll' >> ReadAllFromText(
                         compression_type=CompressionTypes.GZIP))
            assert_that(pcoll, equal_to(lines))
def test_read_all_gzip(self):
    _, lines = write_data(100)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
        # GzipFile is opened in binary mode, so encode the text first.
        f.write('\n'.join(lines).encode('utf-8'))
    pipeline = TestPipeline()
    pcoll = (pipeline
             | Create([file_name])
             | 'ReadAll' >> ReadAllFromText(
                 compression_type=CompressionTypes.GZIP))
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_all_many_file_patterns(self):
    pattern1, expected_data1 = write_pattern([5, 3, 12, 8, 8, 4])
    assert len(expected_data1) == 40
    pattern2, expected_data2 = write_pattern([3, 7, 9])
    assert len(expected_data2) == 19
    pattern3, expected_data3 = write_pattern([11, 20, 5, 5])
    assert len(expected_data3) == 41
    expected_data = []
    expected_data.extend(expected_data1)
    expected_data.extend(expected_data2)
    expected_data.extend(expected_data3)
    with TestPipeline() as pipeline:
        pcoll = (pipeline
                 | 'Create' >> Create([pattern1, pattern2, pattern3])
                 | 'ReadAll' >> ReadAllFromText())
        assert_that(pcoll, equal_to(expected_data))
def test_read_all_many_single_files(self):
    file_name1, expected_data1 = write_data(5)
    assert len(expected_data1) == 5
    file_name2, expected_data2 = write_data(10)
    assert len(expected_data2) == 10
    file_name3, expected_data3 = write_data(15)
    assert len(expected_data3) == 15
    expected_data = []
    expected_data.extend(expected_data1)
    expected_data.extend(expected_data2)
    expected_data.extend(expected_data3)
    with TestPipeline() as pipeline:
        pcoll = (pipeline
                 | 'Create' >> Create([file_name1, file_name2, file_name3])
                 | 'ReadAll' >> ReadAllFromText())
        assert_that(pcoll, equal_to(expected_data))
def test_read_all_unavailable_files_ignored(self):
    file_name1, expected_data1 = write_data(5)
    assert len(expected_data1) == 5
    file_name2, expected_data2 = write_data(10)
    assert len(expected_data2) == 10
    file_name3, expected_data3 = write_data(15)
    assert len(expected_data3) == 15
    file_name4 = "/unavailable_file"
    expected_data = []
    expected_data.extend(expected_data1)
    expected_data.extend(expected_data2)
    expected_data.extend(expected_data3)
    with TestPipeline() as pipeline:
        pcoll = (pipeline
                 | 'Create' >> Create(
                     [file_name1, file_name2, file_name3, file_name4])
                 | 'ReadAll' >> ReadAllFromText())
        assert_that(pcoll, equal_to(expected_data))
def wordCount(word):
    # wc, the global word-count dict, is defined earlier in the script.
    wc[word] += 1

def splitLines(line):
    # Tokenize a line and drop stop words (stop_words is defined earlier).
    text = nltk.word_tokenize(line.lower().strip("\r\n"))
    filtered_sentence = [w for w in text if w not in stop_words]
    return filtered_sentence

from sklearn.feature_extraction.text import HashingVectorizer

# Reuse the same stop-word list as splitLines.
vectorizer = HashingVectorizer(strip_accents='unicode', stop_words=stop_words)

def hashingVector(doc):
    return vectorizer.fit_transform(doc)

# books is a PCollection of file patterns built earlier in the pipeline.
(books
 | "Read Files" >> ReadAllFromText()
 | "Split Lines" >> beam.ParDo(splitLines)
 | "Clean Words" >> beam.ParDo(wordClean)
 | "Count Words" >> beam.ParDo(wordCount))

result = pipeline.run()
result.wait_until_finish()

# Export the word counts to BigQuery via pandas.
import pandas as pd

df = pd.DataFrame({"word": list(wc.keys()), "count": list(wc.values())})
# df.to_json("./test.json", orient="records")
# df.sort_values("y", ascending=False).iloc[:500].to_json("./test.json", orient="records")
df.to_gbq("nlp.wordcounts", "dsba6155", if_exists='append')
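# hashingVector is defined above but never applied in the pipeline. As a rough,
# hypothetical illustration (the documents below are made up), it turns a list
# of documents into a scipy.sparse matrix of hashed token features:
sample_docs = ["call me ishmael", "it was the best of times"]
features = hashingVector(sample_docs)
# One row per document, 2**20 feature columns (HashingVectorizer's default).
print(features.shape)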