import lxml.etree
import pytest

# `extract` is the XML-to-dict mapping function under test (project-local import).


def test_array():
    inp = '<baz><foo>bar</foo><foo>bar2</foo></baz>'
    mp = [{'column': 'foo', 'path': 'foo', 'array': True}]
    et = lxml.etree.fromstring(inp)
    assert extract(et, mp) == {'foo': ['bar', 'bar2']}
    # Without 'array', multiple matches for a path must raise ValueError.
    del mp[0]['array']
    with pytest.raises(ValueError):
        extract(et, mp)
def test_attrs():
    inp = '<baz><foo a="b" c="d">abc</foo></baz>'
    mp = [{'column': 'foo', 'path': 'foo', 'attrs': {'a': 'b', 'c': 'd'}}]
    et = lxml.etree.fromstring(inp)
    assert extract(et, mp) == {'foo': 'abc'}
    # Filtering on an attribute the element does not carry must raise KeyError.
    mp[0]['attrs'] = {'b': 'c'}
    et = lxml.etree.fromstring(inp)
    with pytest.raises(KeyError):
        extract(et, mp)
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

# `main` is the module under test (project-local import).


def test_extract(spark_session: SparkSession) -> None:
    in_path = "tests/unit/some_text_file.txt"
    expected_df = spark_session.createDataFrame(
        [
            "hello world world",
            "hello world",
            "",
            "test",
        ],
        schema=StringType(),
    )
    df = main.extract(spark_session, in_path)
    assert df.collect() == expected_df.collect()
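# For context, a minimal sketch of what `main.extract` is assumed to do here:
# read the input file line by line into a single-column string DataFrame. This
# is a hypothetical stand-in, not the project's actual implementation.
from pyspark.sql import DataFrame


def extract_sketch(spark: SparkSession, path: str) -> DataFrame:
    # spark.read.text yields one row per input line, in a string `value` column
    return spark.read.text(path)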
def test_nested_and_array():
    inp = '<baz><foo><bak>a1</bak><bak>a2</bak></foo></baz>'
    mp = [{
        'column': 'foo',
        'path': 'foo',
        'mapping': [
            {'column': 'bak', 'path': 'bak', 'array': True},
        ],
    }]
    et = lxml.etree.fromstring(inp)
    assert extract(et, mp) == {'foo': {'bak': ['a1', 'a2']}}
def test_attr_and_nested():
    inp = '<baz><foo prop="abc">bar</foo></baz>'
    mp = [{
        'column': 'bak',
        'path': 'foo',
        'mapping': [
            {'column': 'nested', 'path': None},
            {'column': 'nested2', 'path': '#prop'},
        ],
    }]
    et = lxml.etree.fromstring(inp)
    assert extract(et, mp) == {'bak': {'nested': 'bar', 'nested2': 'abc'}}
import logging
import os

import spacy

# `main`, `get_name_of_filepath`, `BlobManager` and `config_blob` are
# project-local imports.


def UploadManager(filename):
    if get_name_of_filepath(filename) != 'NOT A WORD DOCUMENT':
        print(filename)
        main.blob_download()
        logging.getLogger().setLevel(logging.INFO)

        # Extract data from upstream.
        observations = main.extract()

        # Load the spaCy NLP model.
        nlp = spacy.load('en')

        # Transform data to have the appropriate fields.
        observations, nlp = main.transform(observations, nlp)

        # Load data for downstream consumption.
        main.load(observations, nlp)
        main.load_to_json(observations, nlp)

        # Send data to Cosmos DB.
        main.send_to_Cosmos()
        print('data sent to Cosmos DB')

        # Create the Azure Search data source, index, and indexer.
        main.implement_Azure_search()
    else:
        print('error: not a Word document: ' + filename)
        return 'NOT A WORD DOCUMENT'
def UploadManager_path(filepath, output_directorypath):
    blob = BlobManager(config_blob.BLOB_NAME, config_blob.BLOB_KEY)
    if get_name_of_filepath(filepath) != 'NOT A WORD DOCUMENT':
        print(filepath)
        # Upload the source document to blob storage.
        path, filename = os.path.split(get_name_of_filepath(filepath))
        blob.upload(path, filename, config_blob.BLOB_CONTAINER)
        print('blob connection')

        # Mirror the processed blobs into the output directory.
        root_output, directory_output = os.path.split(output_directorypath)
        print(directory_output)
        print(root_output)
        blob.download_all_blobs(root_output, directory_output, config_blob.BLOB_CONTAINER)

        logging.getLogger().setLevel(logging.INFO)

        # Extract data from upstream.
        observations = main.extract()

        # Load the spaCy NLP model.
        nlp = spacy.load('en')

        # Transform data to have the appropriate fields.
        observations, nlp = main.transform(observations, nlp)

        # Load data for downstream consumption.
        main.load(observations, nlp)
        main.load_to_json(observations, nlp)

        # Send data to Cosmos DB.
        send_to_Cosmos()
        print('data sent to Cosmos DB')
    else:
        print('error: ' + filepath)
        return 'NOT A WORD DOCUMENT'
import librosa
import numpy as np

# `extract` is the highlight-extraction entry point of the library this script
# builds on (project-local import).


def main(audiofile, output):
    fs = [audiofile]
    # Locate a 30-second highlight; the first result is taken as the chorus.
    highlightList = list(
        extract(fs, length=30, save_score=False, save_thumbnail=False, save_wav=False)
    )
    begin, end = highlightList[0]
    dur = librosa.get_duration(filename=audiofile)
    intervals = [
        (0, begin),
        (begin, end),
        (end, dur),
    ]
    labels = [
        "others",
        "chorus",
        "others",
    ]
    # Write tab-separated (start, end, label) rows.
    contents = np.array(
        [(x[0], x[1], y) for x, y in zip(intervals, labels)],
        np.dtype("f, f, U16"),
    )
    np.savetxt(output, contents, fmt=["%.2f", "%.2f", "%s"], delimiter="\t")
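# Hypothetical usage, assuming a "song.mp3" exists alongside this script: this
# writes an Audacity-style label track, one "start<TAB>end<TAB>label" row per
# interval, marking the detected chorus.
if __name__ == "__main__":
    main("song.mp3", "song_labels.txt")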
def test_basic():
    inp = '<baz><foo>bar</foo></baz>'
    mp = [{'column': 'foo', 'path': 'foo'}]
    et = lxml.etree.fromstring(inp)
    assert extract(et, mp) == {'foo': 'bar'}
def test_empty():
    inp = '<baz><foo>bar</foo></baz>'
    mp = [{'column': 'bak', 'path': 'bak'}]
    et = lxml.etree.fromstring(inp)
    assert extract(et, mp) == {'bak': None}
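# A minimal sketch of an `extract` consistent with the tests above — a
# hypothetical stand-in, not the project's actual implementation. It walks a
# list of column specs, supporting `array`, `attrs` filters, nested `mapping`
# lists, `path=None` (the current element's text) and '#name' attribute lookups:
def extract(element, mapping):
    row = {}
    for spec in mapping:
        column, path = spec['column'], spec['path']
        if path is None:
            # path=None refers to the current element's own text.
            row[column] = element.text
            continue
        if path.startswith('#'):
            # '#name' reads an attribute off the current element.
            row[column] = element.get(path[1:])
            continue
        matches = element.findall(path)
        if 'attrs' in spec:
            # Keep only elements whose attributes match; a missing attribute
            # raises KeyError, as test_attrs expects.
            matches = [m for m in matches
                       if all(m.attrib[k] == v for k, v in spec['attrs'].items())]
        if not matches:
            row[column] = None
        elif spec.get('array'):
            row[column] = [m.text for m in matches]
        elif len(matches) > 1:
            raise ValueError('multiple matches for %r without array=True' % path)
        elif 'mapping' in spec:
            # Recurse into nested column specs.
            row[column] = extract(matches[0], spec['mapping'])
        else:
            row[column] = matches[0].text
    return row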
import argparse
import logging
import sys

# `parse` and `extract` are project-local helpers: `parse` fetches a page and
# returns (body, status_code); `extract` pulls account identifiers out of it.


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Extract accounts\' identifiers from pages.')
    parser.add_argument('url', help='url to parse')
    parser.add_argument('--cookies', default='', help='cookies to make http requests with auth')
    parser.add_argument('--debug', action='store_true', help='log debug information')
    parser.add_argument('--file', action='store_true', help='load from file instead of URL')
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level, format='-' * 40 + '\n%(levelname)s: %(message)s')

    if not args.file:
        page, status = parse(args.url, args.cookies)
        if status != 200:
            logging.info('Answer code {}, something went wrong'.format(status))
    else:
        page = open(args.url).read()

    info = extract(page)
    if not info:
        sys.exit()

    logging.info('Result\n' + '-' * 40)
    for key, value in info.items():
        print('%s: %s' % (key, value))
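# Hypothetical invocations (assuming this script is saved as extract_ids.py):
#
#     python extract_ids.py https://example.com/profile --debug
#     python extract_ids.py saved_page.html --file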