Example #1
def get_pagina_result(url, link):
    # Load a previously saved AutoScraper model for this site
    PAGINA_scraper = AutoScraper()
    PAGINA_scraper.load('./' + url + '-search.json')
    # Return de-duplicated results grouped by rule alias
    result = PAGINA_scraper.get_result_similar(link,
                                               unique=True,
                                               group_by_alias=True)
    return _aggregate_result(result)
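
A hedged usage sketch: assuming a model was saved earlier as `<site>-search.json` and that `_aggregate_result` flattens the alias groups (neither is shown above), the helper would be driven roughly like this. The site name, URL and wanted text are placeholders:

# One-time build step (hypothetical site and example text)
trainer = AutoScraper()
trainer.build('https://example.com/search?q=test', wanted_list=['Some result title'])
trainer.save('./example-search.json')

result = get_pagina_result('example', 'https://example.com/search?q=test')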
Example #2
def auto(self, url, model_name='1'):
    scraper = AutoScraper()
    scraper.load(model_name)
    # Fetch the page ourselves so the encoding can be fixed before parsing
    html_ = requests.get(url)
    html_.encoding = html_.apparent_encoding
    html_ = html_.text
    data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
    return data
Example #3
def autoscraper():
    link = request.json["Link"]
    global url
    url = link
    wanted_list = request.json["Metodo"]
    global scraper
    scraper = AutoScraper()
    wanted_dict = {
        'url': [
            'https://www.rosario3.com/policiales/Robaron-dos-autos-de-alta-gama-de-una-concesionaria-y-los-encontraron-en-un-galpon-20201014-0080.html',
            'https://www.rosario3.com/-economia-negocios-agro-/La-inflacion-de-septiembre-fue-del-28-segun-el-Indec-20201014-0087.html',
            'https://www.rosario3.com/informaciongeneral/Coronavirus-confirmaron-el-primer-caso-de-reinfeccion-en-Rosario-20201014-0030.html'
        ]
    }
    scraper.build(url=link, wanted_dict=wanted_dict)
    result = scraper.get_result_similar(link, grouped=True)

    # Collect the generated rule ids (avoid shadowing the built-in dict)
    regla = list(result.keys())
    #data = get_pagina_result(url, link)
    #json_format = json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False)
    return regla
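
This reads like a Flask view; a minimal sketch of the wiring it presumably relies on (the app object, route path, and JSON response are assumptions not shown in the snippet):

from flask import Flask, request, jsonify
from autoscraper import AutoScraper

app = Flask(__name__)

@app.route('/autoscraper', methods=['POST'])  # route path is an assumption
def autoscraper_route():
    # Wrap the list so older Flask versions can serialize it
    return jsonify(autoscraper())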
Example #4
from autoscraper import AutoScraper

# AutoScraper must be installed with
#  pip install git+https://github.com/alirezamika/autoscraper.git

question = "france"
time = "year"
url = f"https://www.quora.com/search?q={question}&time={time}"
model_name = "model_quora"

scraper = AutoScraper()
scraper.load(f"./{model_name}")
results = scraper.get_result_similar(url)

# print the results, if any were found
if results:
    for r in results:
        print(r)
else:
    print("No result found")
Example #5
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""

from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?k=iphones"

# A price and a product title copied from the page serve as training examples
wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)

print(scraper.get_result_similar(amazon_url, grouped=True))



Example #6
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""

from autoscraper import AutoScraper

Scrap = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"

req_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]
Scrap_amzn = Scrap.build(amzn_url, req_list_amzn)
res_amzn = Scrap.get_result_similar(amzn_url, grouped=True)

dyk = list(res_amzn.keys())
print(dyk)
# Alias the last rule as the title and the first as the price, keep only those two
Scrap.set_rule_aliases({dyk[-1]: 'Title', dyk[0]: 'Price'})
Scrap.keep_rules([dyk[-1], dyk[0]])
Scrap.save('amazon-search3')
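
Once saved, the aliased model can be reloaded and queried by alias; a minimal sketch (the search URL is a placeholder):

loader = AutoScraper()
loader.load('amazon-search3')
labelled = loader.get_result_similar("https://www.amazon.in/s?k=macbook", group_by_alias=True)
print(labelled.get('Title'), labelled.get('Price'))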
Example #7
# The original snippet is truncated above; the imports and the pages scraper
# below are a hedged reconstruction of the missing lines.
from urllib.parse import urljoin
from autoscraper import AutoScraper
from expression.collections import Seq, seq  # assumption: the Expression library

standard_ebooks_url = "https://standardebooks.org/ebooks"  # assumption

pages_scraper = AutoScraper()
scraped_pages_urls = pages_scraper.build(
    standard_ebooks_url, ["/ebooks/?page=2", "/ebooks/?page=3"])
pages_urls = Seq(scraped_pages_urls).pipe(
    seq.map(lambda page: urljoin(standard_ebooks_url, page)))

# Page Scraper
page_scraper = AutoScraper()
books_urls = page_scraper.build(
    standard_ebooks_url,
    [
        "/ebooks/ford-madox-ford/some-do-not",
        "/ebooks/booth-tarkington/the-turmoil",
        "/ebooks/anatole-france/penguin-island/a-w-evans",
        "/ebooks/edgar-allan-poe/the-narrative-of-arthur-gordon-pym-of-nantucket"
    ],
    update=True)
for page in pages_urls:
    print(page)
    urls = page_scraper.get_result_similar(page)
    books_urls = list(set(books_urls + urls))
books_urls = Seq(books_urls).pipe(
    seq.map(
        lambda book_path: urljoin("https://standardebooks.org", book_path)))

# Book Scraper
book_scraper = AutoScraper()

azw3s_with_thumbnails = [
    "/ebooks/henry-fielding/the-history-of-tom-jones-a-foundling/downloads/henry-fielding_the-history-of-tom-jones-a-foundling.azw3",
    "/ebooks/henry-fielding/the-history-of-tom-jones-a-foundling/downloads/thumbnail_4e2aa05093a56fecc2e1f7e015e8aa2967f56208_EBOK_portrait.jpg",
]
books_assets = book_scraper.build(
    "https://standardebooks.org/ebooks/henry-fielding/the-history-of-tom-jones-a-foundling",
    azw3s_with_thumbnails,
)  # closing parenthesis restored; the original snippet was cut off here
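
A plausible continuation, looping the asset scraper over the collected book URLs (the loop body is an assumption; the original snippet is cut off):

for book_url in books_urls:
    assets = book_scraper.get_result_similar(book_url)
    print(book_url, assets)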
Example #8
def url_data():
    about()
    st.info("This feature has limited functionality")
    url = st.text_input("Webpage URL", help="Enter a URL where your data is placed")
    if url == "":
        st.info("Please enter a valid input to get started")
        st.stop()

    # getting column names as user input
    column_name = st.text_input("Enter candidate column names", key="value")
    value_list = column_name.split(",")

    # getting example values for reference
    candidate = st.text_input("Candidate example value", key="candidates", help="use ; as separator to enter another value")
    items_list = candidate.split(";")
    
    # create the scraper object
    scraper = AutoScraper()
    # learn scraping rules from the example values
    final_result = scraper.build(url, items_list)

    # collect similar results, grouped by rule
    results = scraper.get_result_similar(url, grouped=True, keep_order=True)
    # drop duplicate rule groups that captured the same values
    result = {}
    for key, value in results.items():
        if value not in result.values():
            result[key] = value

    orient_df = pd.DataFrame.from_dict(result, orient="index")
    df = orient_df.transpose()

    df.columns = value_list
    df.fillna(value=np.nan, inplace=True)  # pd.np was removed from pandas; requires "import numpy as np"
    st.write(df)
    
    cols = df.columns.tolist()
    col1, col2 = st.beta_columns(2)

    target = col1.selectbox("Select Target", cols, key="target")

    typelist = ['binary', 'multiclass', 'regression', 'time series regression', 'time series multiclass', 'time series binary']
    p_type = col2.selectbox("Select problem type", typelist, key="p_type")
    x = df.drop(columns=target)
    y = df[target]
    x_train, x_test, y_train, y_test = evalml.preprocessing.split_data(x, y, problem_type=p_type)

    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()


    rank = automl.rankings

    # checking best pipeline
    best_pipeline = automl.best_pipeline
    description = automl.describe_pipeline(automl.rankings.iloc[0]["id"])

    ### Optimize the code

    ### Evaluate on hold-out data
    problem_list = ['binary', 'time series binary']
    problem_list2 = ['multiclass', 'time series multiclass']

    cola, col_b, colc = st.beta_columns(3)
    
    if p_type in problem_list:
        objective = col_b.selectbox("select objective", objectives().binary_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["auc", "f1", "Precision", "Recall"])

        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                    problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['f1', 'precision'],
                                    max_batches=1,
                                    optimize_thresholds=True)

        automl_tuned.search()

        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"], return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline

        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tuned_pipeline.predict_proba(x_test).to_dataframe()


    # for multiclass problems
    elif p_type in problem_list2:
        objective = col_b.selectbox("select objective", objectives().multiclass_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass", "MCC multiclass", "accuracy multiclass"])

        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                    problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                    max_batches=1,
                                    optimize_thresholds=True)

        automl_tuned.search()

        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"], return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline

        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tuned_pipeline.predict(x_test).to_series()

    
    # for regression problems
    else:
        objective = col_b.selectbox("select objective", objectives().regression_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2", "MSE", "MAE", "Root Mean Squared Error"])

        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                    problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['Root Mean Squared Error', 'MSE', 'MAE'],
                                    max_batches=1,
                                    optimize_thresholds=True)

        automl_tuned.search()

        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"], return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline

        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        tuned_pipeline.fit(x_train, y_train)
        pred = tuned_pipeline.predict(x_test).to_series()
                
    # persist the pipeline description so it can be downloaded later
    file = open("model_details.txt", "w")
    str_dict = repr(tuned_description)
    file.write(str_dict)
    file.close()

    def get_binary_file_downloader_html(bin_file, file_label='File'):
        # Embed the file as a base64 data URI so Streamlit can serve it as a download link
        with open(bin_file, 'rb') as f:
            data = f.read()
        bin_str = base64.b64encode(data).decode()
        href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
        return href
    col1, col2, col3 = st.beta_columns([1, 1, 1])
    if col2.button("Predict Results", key="output", help="shows results"):
        with st.spinner(text='In progress'):
            st.info("Wait while we select the best algorithm for your problem. Hold your breath.")
            time.sleep(20)
        st.info("Done. Here you go.")
        st.write(pred)

    col11, col12 = st.beta_columns([3, 1])
    with col11:
        with st.beta_expander("Compare Models"):
            st.write(tuned_rankings)

    with col12:
        with st.beta_expander("Best Pipeline"):
            st.success(tuned_pipeline)
            st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'), unsafe_allow_html=True)
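
Stripped of the Streamlit UI, the core EvalML flow this function runs is roughly the following (a sketch assuming a plain pandas DataFrame `df` with a "target" column and a binary problem; the names are placeholders):

import evalml
from evalml.automl import AutoMLSearch

x = df.drop(columns="target")
y = df["target"]
x_train, x_test, y_train, y_test = evalml.preprocessing.split_data(
    x, y, problem_type="binary")

automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type="binary")
automl.search()

best = automl.best_pipeline
print(best.score(x_test, y_test, objectives=["auc"]))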
Example #9
# Build the Etsy search URL from keywords and a page number
urlMainSearch = urlMainSearch + keyword1
urlMainSearch = urlMainSearch + '+' + keyword2
#urlMainSearch = urlMainSearch + '+' + keyword3
urlMainSearch = urlMainSearch + '&ref=pagination&page='
urlMainSearch = urlMainSearch + str(pageNum)

# Hard-coded override used while testing
urlMainSearch = 'https://www.etsy.com/search?q=3d+printing&ref=pagination&page=2'

print(urlMainSearch)
scraperMain = AutoScraper()
scraperMain.load('etsyMain')
resultsMain = scraperMain.get_result_similar(urlMainSearch, contain_sibling_leaves=True)
print(resultsMain)
loop = 1
for listingUrl in resultsMain:
    print('loop #' + str(loop))  # fixed: concatenating an int raised TypeError
    loop = loop + 1
    outF = open(scrapeRecord, "a")  # "a" appends to the record file

    # scraperitemFavorites is a second AutoScraper model loaded elsewhere
    scraped = scraperitemFavorites.get_result_similar(listingUrl)
    if len(scraped) == 0:
        saveMe = '0'
    else:
        saveMe = ';'.join([str(elem) for elem in scraped])
        saveMe = saveMe.replace('\n', '')
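
The snippet cuts off here; presumably the joined values are appended to the record file, roughly like this (hypothetical record format):

    outF.write(listingUrl + ',' + saveMe + '\n')  # hypothetical line format
    outF.close()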
Example #10
search = "iphone+11+silver"
amazon_url="https://www.amazon.in/s?k={}&s=price-desc-rank".format(search)
print(amazon_url)

"""# Defining what data I want """

wanted_list=["https://m.media-amazon.com/images/I/71umuN8XVeL._AC_UY218_.jpg","New Apple iPhone 12 Pro Max (128GB) - Silver","1,25,900","501"]

"""# Creating scraper object"""

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)

"""# Finding similar data"""

data = scraper.get_result_similar(amazon_url,grouped=True)
print(data)

keys = list(data.keys())
print(keys)

"""# Defining alias"""

scraper.set_rule_aliases({keys[0]: 'ImageUrl', keys[2]: 'Title', keys[-2]: 'Price', keys[-1]: 'Reviews'})

scraper.save("amazon_in.json")

"""# Testing for other search word"""

amazon_scraper = AutoScraper()
amazon_scraper.load('amazon_in.json')
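
The test run stops right after load(); a natural next step is querying the reloaded model with a different search (the search term is a placeholder):

test_url = "https://www.amazon.in/s?k=macbook+air"  # placeholder search
print(amazon_scraper.get_result_similar(test_url, group_by_alias=True))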