def get_pagina_result(url, link):
    PAGINA_scraper = AutoScraper()
    PAGINA_scraper.load('./' + url + '-search.json')
    result = PAGINA_scraper.get_result_similar(link, unique=True, group_by_alias=True)
    return _aggregate_result(result)
def auto(self, url, model_name='1'):
    scraper = AutoScraper()
    scraper.load(model_name)
    html_ = requests.get(url)
    html_.encoding = html_.apparent_encoding
    html_ = html_.text
    data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
    return data
def autoscraper():
    link = request.json["Link"]
    global url
    url = request.json["Link"]
    wanted_list = request.json["Metodo"]
    global scraper
    scraper = AutoScraper()
    wanted_dict = {
        'url': [
            'https://www.rosario3.com/policiales/Robaron-dos-autos-de-alta-gama-de-una-concesionaria-y-los-encontraron-en-un-galpon-20201014-0080.html',
            'https://www.rosario3.com/-economia-negocios-agro-/La-inflacion-de-septiembre-fue-del-28-segun-el-Indec-20201014-0087.html',
            'https://www.rosario3.com/informaciongeneral/Coronavirus-confirmaron-el-primer-caso-de-reinfeccion-en-Rosario-20201014-0030.html'
        ]
    }
    scraper.build(url=link, wanted_dict=wanted_dict)
    results = scraper.get_result_similar(link, grouped=True)
    regla = list(results.keys())  # collect the learned rule ids
    #data = get_pagina_result(url, link)
    #json_format = json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False)
    return regla
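# A minimal client sketch for the endpoint above, assuming it is exposed as a Flask POST
# route. The route path, port, and payload values below are placeholders; only the
# "Link"/"Metodo" JSON keys come from the handler itself.
import requests

payload = {
    "Link": "https://www.rosario3.com/",   # page to learn rules from (hypothetical)
    "Metodo": ["example wanted text"],      # wanted-list-style examples (hypothetical)
}
response = requests.post("http://localhost:5000/autoscraper", json=payload)
print(response.json())  # expected to echo the learned rule ids (regla)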
from autoscraper import AutoScraper

# AutoScraper must be installed with:
# pip install git+https://github.com/alirezamika/autoscraper.git

question = "france"
time = "year"
url = f"https://www.quora.com/search?q={question}&time={time}"

model_name = "model_quora"

scraper = AutoScraper()
scraper.load(f"./{model_name}")

results = scraper.get_result_similar(url)

# print the results, if any
if results:
    for r in results:
        print(r)
else:
    print("No result found")
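# The script above loads a pre-built "model_quora" file. A minimal sketch of how such a
# model might be created beforehand with AutoScraper's build()/save() API; the wanted_list
# entry below is a placeholder, not taken from the original script.
from autoscraper import AutoScraper

build_url = "https://www.quora.com/search?q=france&time=year"
wanted_list = ["Why is France so famous?"]  # a question title visible on the results page (hypothetical)

scraper = AutoScraper()
scraper.build(build_url, wanted_list)
scraper.save("./model_quora")  # creates the file loaded by the script above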
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""
from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?k=iphones"
wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)
print(scraper.get_result_similar(amazon_url, grouped=True))
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""
from autoscraper import AutoScraper

Scrap = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"
req_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

Scrap_amzn = Scrap.build(amzn_url, req_list_amzn)
res_amzn = Scrap.get_result_similar(amzn_url, grouped=True)

dyk = list(res_amzn.keys())
print(dyk)

Scrap.set_rule_aliases({dyk[-1]: 'Title', dyk[0]: 'Price'})
Scrap.keep_rules([dyk[-1], dyk[0]])
Scrap.save('amazon-search3')
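# A minimal follow-up sketch, assuming the 'amazon-search3' model saved above is reused
# later: load it and query another search URL, grouping results by the 'Title'/'Price'
# aliases set with set_rule_aliases(). The search term below is a placeholder.
from autoscraper import AutoScraper

saved = AutoScraper()
saved.load('amazon-search3')
new_url = "https://www.amazon.in/s?k=macbook"  # hypothetical new search
print(saved.get_result_similar(new_url, group_by_alias=True))  # e.g. {'Title': [...], 'Price': [...]}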
    standard_ebooks_url, ["/ebooks/?page=2", "/ebooks/?page=3"])

pages_urls = Seq(scraped_pages_urls).pipe(
    seq.map(lambda page: urljoin(standard_ebooks_url, page)),
)

# Page Scraper
page_scraper = AutoScraper()
books_urls = page_scraper.build(standard_ebooks_url, [
    "/ebooks/ford-madox-ford/some-do-not",
    "/ebooks/booth-tarkington/the-turmoil",
    "/ebooks/anatole-france/penguin-island/a-w-evans",
    "/ebooks/edgar-allan-poe/the-narrative-of-arthur-gordon-pym-of-nantucket"
], update=True)

for page in pages_urls:
    print(page)
    urls = page_scraper.get_result_similar(page)
    books_urls = list(set(books_urls + urls))

books_urls = Seq(books_urls).pipe(
    seq.map(
        lambda book_path: urljoin("https://standardebooks.org", book_path)))

# Book Scraper
book_scraper = AutoScraper()
azw3s_with_thumbnails = [
    "/ebooks/henry-fielding/the-history-of-tom-jones-a-foundling/downloads/henry-fielding_the-history-of-tom-jones-a-foundling.azw3",
    "/ebooks/henry-fielding/the-history-of-tom-jones-a-foundling/downloads/thumbnail_4e2aa05093a56fecc2e1f7e015e8aa2967f56208_EBOK_portrait.jpg",
]
books_assets = book_scraper.build(
    "https://standardebooks.org/ebooks/henry-fielding/the-history-of-tom-jones-a-foundling",
    azw3s_with_thumbnails,
def url_data():
    about()
    st.info("This feature has limited functionality")
    url = st.text_input("Webpage URL", help="Enter a url where your data is placed")
    if url == "":
        st.info("Please enter a valid input to get started")
        st.stop()

    # getting data column names as user input
    column_name = st.text_input("Enter candidate column name", key="value")
    value_list = column_name.split(",")

    # getting example data values for reference
    candidate = st.text_input("Candidate example value", key="candidates",
                              help="use ; as separator to enter another value")
    items_list = candidate.split(";")
    #st.write(items)

    # create object
    scraper = AutoScraper()
    # feeding for scraping
    final_result = scraper.build(url, items_list)
    # display result
    results = scraper.get_result_similar(url, grouped=True, keep_order=True)
    result = {}
    for key, value in results.items():
        if value not in result.values():
            result[key] = value
    orient_df = pd.DataFrame.from_dict(result, orient="index")
    df = orient_df.transpose()
    df.columns = value_list
    df.fillna(value=pd.np.nan, inplace=True)
    st.write(df)

    cols = df.columns.tolist()
    col1, col2 = st.beta_columns(2)
    target = col1.selectbox("Select Target", cols, key="target")
    typelist = ['binary', 'multiclass', 'regression', 'time series regression',
                'time series multiclass', 'time series binary']
    p_type = col2.selectbox("Select problem type", typelist, key="p_type")

    x = df.drop(columns=target)
    y = df[target]
    x_train, x_test, y_train, y_test = evalml.preprocessing.split_data(x, y, problem_type=p_type)
    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()
    rank = automl.rankings

    # checking best pipeline
    best_pipeline = automl.best_pipeline
    description = automl.describe_pipeline(automl.rankings.iloc[0]["id"])

    ### Optimize the code
    ### Evaluate on hold out data
    problem_list = ['binary', 'time series binary']
    problem_list2 = ['multiclass', 'time series multiclass']
    cola, col_b, colc = st.beta_columns(3)

    # for binary type problems
    if p_type in problem_list:
        objective = col_b.selectbox("select objective", objectives().binary_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["auc", "f1", "Precision", "Recall"])
        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                     objective=objective,
                                     additional_objectives=['f1', 'precision'],
                                     max_batches=1, optimize_thresholds=True)
        automl_tunned.search()
        tunned_rankings = automl_tunned.rankings
        tunned_description = automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"], return_dict=True)
        tunned_pipeline = automl_tunned.best_pipeline
        tunned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tunned_pipeline.predict_proba(x_test).to_dataframe()

    # for multiclass type problems
    elif p_type in problem_list2:
        objective = col_b.selectbox("select objective", objectives().multiclass_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass", "MCC multiclass", "accuracy multiclass"])
        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                     objective=objective,
                                     additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                     max_batches=1, optimize_thresholds=True)
        automl_tunned.search()
        tunned_rankings = automl_tunned.rankings
        tunned_description = automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"], return_dict=True)
        tunned_pipeline = automl_tunned.best_pipeline
        tunned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tunned_pipeline.predict(x_test).to_series()

    # for regression type problems
    else:
        objective = col_b.selectbox("select objective", objectives().regression_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2", "MSE", "MAE", "Root Mean Squared Error"])
        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                     objective=objective,
                                     additional_objectives=['Root Mean Squared Error', 'MSE', 'MAE'],
                                     max_batches=1, optimize_thresholds=True)
        automl_tunned.search()
        tunned_rankings = automl_tunned.rankings
        tunned_description = automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"], return_dict=True)
        tunned_pipeline = automl_tunned.best_pipeline
        tunned_pipeline.score(x_test, y_test, objectives=[objective])
        tunned_pipeline.fit(x_train, y_train)
        pred = tunned_pipeline.predict(x_test).to_series()

    # persist the tuned pipeline description
    file = open("model_details.txt", "w")
    str_dict = repr(tunned_description)
    file.write(str_dict)
    file.close()

    def get_binary_file_downloader_html(bin_file, file_label='File'):
        with open(bin_file, 'rb') as f:
            data = f.read()
        bin_str = base64.b64encode(data).decode()
        href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
        return href

    col1, col2, col3 = st.beta_columns([1, 1, 1])
    if col2.button("Predict Results", key="output", help="shows results"):
        with st.spinner(text='In progress'):
            st.info("Wait while we select the best algorithm for your problem. Hold your breath.")
            time.sleep(20)
        st.info("Done. Here you go.")
        st.write(pred)
        col11, col12 = st.beta_columns([3, 1])
        with col11:
            with st.beta_expander("Compare Models"):
                st.write(tunned_rankings)
        with col12:
            with st.beta_expander("Best Pipeline"):
                st.success(tunned_pipeline)
        st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'),
                    unsafe_allow_html=True)
urlMainSearch = urlMainSearch + keyword1
urlMainSearch = urlMainSearch + '+' + keyword2
#urlMainSearch = urlMainSearch + '+' + keyword3
urlMainSearch = urlMainSearch + '&ref=pagination&page='
urlMainSearch = urlMainSearch + str(pageNum)
urlMainSearch = 'https://www.etsy.com/search?q=3d+printing&ref=pagination&page=2'
#'https://www.etsy.com/search?q=3d+printing'
print(urlMainSearch)

scraperMain = AutoScraper()
scraperMain.load('etsyMain')
print('here1')
resultsMain = scraperMain.get_result_similar(urlMainSearch, contain_sibling_leaves=True)
# other options tried: keep_order=True, group_by_alias=True, attr_fuzz_ratio=0.8
print(resultsMain)

loop = 1
print('here2')
for listingUrl in resultsMain:
    print('loop#' + str(loop))
    loop = loop + 1
    outF = open(scrapeRecord, "a")  # "a" for append
    scraped = scraperitemFavorites.get_result_similar(listingUrl)
    # other options tried: grouped=True, group_by_alias=True, keep_order=True, contain_sibling_leaves=True
    if len(scraped) == 0:  # if not scraped:
        saveMe = '0'
    else:
        saveMe = ';'.join([str(elem) for elem in scraped])
        saveMe = saveMe.replace('\n', '')
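# The snippet above loads a saved 'etsyMain' model. A minimal sketch of how such a model
# might have been built from an Etsy search page with one example listing URL as the
# wanted value; the listing URL below is a placeholder, not taken from the original code.
from autoscraper import AutoScraper

buildUrl = 'https://www.etsy.com/search?q=3d+printing'
wantedListings = ['https://www.etsy.com/listing/000000000/example-3d-printed-item']  # hypothetical example
scraperMain = AutoScraper()
scraperMain.build(buildUrl, wantedListings)
scraperMain.save('etsyMain')  # file loaded by the snippet above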
search = "iphone+11+silver" amazon_url="https://www.amazon.in/s?k={}&s=price-desc-rank".format(search) print(amazon_url) """# Defining what data I want """ wanted_list=["https://m.media-amazon.com/images/I/71umuN8XVeL._AC_UY218_.jpg","New Apple iPhone 12 Pro Max (128GB) - Silver","1,25,900","501"] """# Creating scraper object""" scraper=AutoScraper() result=scraper.build(amazon_url,wanted_list) """# Finding similar data""" data = scraper.get_result_similar(amazon_url,grouped=True) print(data) keys = list(data.keys()) print(keys) """# Defining alias""" scraper.set_rule_aliases({str(keys[0]):'ImageUrl',str(keys[2]):'Title',str(keys[-2]):'Price',str(keys[-1]):'Reviews'}) scraper.save("amazon_in.json") """# Testing for other search word""" amazon_scraper = AutoScraper() amazon_scraper.load('amazon_in.json')