Example #1
def get_pagina_result(url, link):
    PAGINA_scraper = AutoScraper()
    PAGINA_scraper.load('./' + url + '-search.json')
    result = PAGINA_scraper.get_result_similar(link,
                                               unique=True,
                                               group_by_alias=True)
    return _aggregate_result(result)
Example #2
def getPrice():
    url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
    wanted_list = ["What are metaclasses in Python?"]

    scraper = AutoScraper()
    result = scraper.build(url, wanted_list)
    print(result)
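    # Hedged continuation (not part of the original snippet): persist the rules
    # learned by build() and reuse them on another, similar question page.
    # The file name 'so-search' and the second URL are illustrative choices only.
    scraper.save('so-search')
    reused = AutoScraper()
    reused.load('so-search')
    similar_url = 'https://stackoverflow.com/questions/101268/hidden-features-of-python'
    print(reused.get_result_similar(similar_url))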
Example #3
def autoscraper(Link=None, Metodo=None):
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]
    scraper = AutoScraper()
    scraper.build(link, wanted_list)
    results = scraper.get_result_exact(link, unique=False, grouped=True)
    l = []
    for k, v in results.items():
        l.extend([k, v])
    regla = l[0]
    scraper.set_rule_aliases({regla: 'regla'})
    scraper.keep_rules([regla])
    # strip scheme, separators and punctuation so the URL can be used as a file name
    for token in ("http:", "//", ".", "www", "https:", "/", "\n", "-"):
        url = url.replace(token, "")
    scraper.save(url + '-search')
    data = get_pagina_result(url, link)
    json_format = json.dumps(data,
                             indent=4,
                             separators=(',', ': '),
                             sort_keys=True,
                             ensure_ascii=False)
    return json_format
Example #4
 def auto(self, url, model_name='1'):
     scraper = AutoScraper()
     scraper.load(model_name)
     html_ = requests.get(url)
     html_.encoding = html_.apparent_encoding
     html_ = html_.text
     data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
     return data
Example #5
 def build(self, wanted_dict=None, model_name='1'):
     """
     url2autospider
     """
     html_ = self.html
     url = self.url
     scraper = AutoScraper()
     scraper.build(html=html_, wanted_dict=wanted_dict)
     # data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
     scraper.save(model_name)
Example #6
async def scrape(ctx, url: str, wanted_list: str):
    # url = ''
    # wanted_list = ['']
    botscraper = AutoScraper()
    print(type(url))
    print(type(wanted_list))
    scrape_result = botscraper.build(url, wanted_list)
    print(scrape_result)
    results_message = '\r\n'.join(['BriefHub bot has found results! 🚀',
                        f"Here is what I found for * {wanted_list} * on * {url} * : {str(scrape_result)}",
                        ':-)'])
     
    await ctx.send(results_message)
def autopan_bot():
    url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
    highgrade_scraper = 'high grade scraper'
    # We can add one or multiple candidates here.
    # You can also put urls here to retrieve urls.
    wanted_list = ['High Grade', 'High-Grade']
    botscraper = AutoScraper()
    highgrade_results = botscraper.build(url, wanted_list)
    print(highgrade_results)
    if highgrade_results:
        print('BriefHub bot has found results! 🚀')
        for result in highgrade_results:
            print(result)
    else:
        print("Hmmm, it doesn't look like we found anything")
        exit(-1)
    botscraper.save(highgrade_scraper)
    print(f"💿 > Save the model {highgrade_scraper}")
Example #8
def autoscraper():
    link = request.json["Link"]
    global url
    url = request.json["Link"]
    wanted_list = request.json["Metodo"]
    global scraper
    scraper = AutoScraper()
    wanted_dict = {
        'url': [
            'https://www.rosario3.com/policiales/Robaron-dos-autos-de-alta-gama-de-una-concesionaria-y-los-encontraron-en-un-galpon-20201014-0080.html',
            'https://www.rosario3.com/-economia-negocios-agro-/La-inflacion-de-septiembre-fue-del-28-segun-el-Indec-20201014-0087.html',
            'https://www.rosario3.com/informaciongeneral/Coronavirus-confirmaron-el-primer-caso-de-reinfeccion-en-Rosario-20201014-0030.html'
        ]
    }
    scraper.build(url=link, wanted_dict=wanted_dict)
    results = scraper.get_result_similar(link, grouped=True)

    regla = list(results.keys())
    #data = get_pagina_result(url, link)
    #json_format = json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False)
    return regla
Example #9
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:29:21 2021

@author: win10
"""
from autoscraper import AutoScraper
from flask import Flask, request

amazon_scraper = AutoScraper()
amazon_scraper.load('amazon-search')
app = Flask(__name__)


def get_amazon_result(search_query):
    url = 'https://www.amazon.in/s?k=%s' % search_query
    result = amazon_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    print(list(result.values())[0])
    for i in range(len(list(result.values())[0])):
        try:

            final_result.append({alias: result[alias][i] for alias in result})
        except IndexError:
            pass
    return final_result
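# Illustrative sketch only: the original snippet stops before any route is defined.
# A minimal endpoint wired to get_amazon_result might look like this; the route
# path and the 'q' query parameter are assumptions, and returning a dict relies
# on Flask >= 1.1 auto-jsonifying it.
@app.route('/amazon', methods=['GET'])
def amazon_search():
    query = request.args.get('q', '')
    return {'result': get_amazon_result(query)}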
Example #10
from autoscraper import AutoScraper
from flask import Flask, request

ebay_scraper = AutoScraper()
etsy_scraper = AutoScraper()
ebay_scraper.load('ebay-search')
etsy_scraper.load('etsy-search')
app = Flask(__name__)


def get_ebay_result(search_query):
    url = 'https://www.ebay.com/sch/i.html?_nkw=%s' % search_query
    result = ebay_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def get_etsy_result(search_query):
    url = 'https://www.etsy.com/search?q=%s' % search_query
    result = etsy_scraper.get_result_similar(url, group_by_alias=True)
    result['url'] = [
        f'https://www.etsy.com/listing/{i}' for i in result['url']
    ]
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    for i in range(len(list(result.values())[0])):
        final_result.append({alias: result[alias][i] for alias in result})
    return final_result
Example #11
print('started scraping...')

data_dir = '/home/pi/Desktop/scraper/'
#scrapeRecord = os.path.join(data_dir,'scrapeRecord.csv') # location to save records of file send attempts for troubleshooting

urlMain = 'https://www.etsy.com/search?q=cosplay%20fire'
urlex = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
wanted_list = [urlex]  # This is the simplest search type: just a single page input

MainLink = [
    ('https://www.etsy.com/search?q=cosplay%20fire', [
        'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
    ]),
]
scraperMain = AutoScraper()  #define a new scraper object
for targetUrl, wanted_list in MainLink:
    scraperMain.build(url=targetUrl, wanted_list=wanted_list)
    #scraperMain.build(urlMain, wanted_list) #build the contents of that scraper
scraperMain.save('etsyMain')  # Saves this particular build of the scraper! (Note: this is a local file, you can load it without having to regenerate it every time; see the reuse sketch below.)

#Build a new batch of features to collect. Best to collect them separately from each other so they don't cross wires!
itemFavorites = [
    ('https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-3&organic_search_click=1&frs=1',
     ['1525 favorites']),
]
scraperitemFavorites = AutoScraper()
for targetUrl, wanted_list in itemFavorites:
    scraperitemFavorites.build(url=targetUrl, wanted_list=wanted_list)
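# Hedged continuation (the original snippet is cut off above): as noted next to
# save(), a saved model can be reloaded instead of being rebuilt every time.
# 'etsyMain' is the file saved earlier; the query URL is only an example.
reloadedMain = AutoScraper()
reloadedMain.load('etsyMain')
print(reloadedMain.get_result_similar('https://www.etsy.com/search?q=cosplay%20ice'))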
Example #12
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""

from autoscraper import AutoScraper

Scrap = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"

req_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]
Scrap_amzn = Scrap.build(amzn_url, req_list_amzn)
res_amzn = Scrap.get_result_similar(amzn_url, grouped=True)

dyk = list(res_amzn.keys())
print(dyk)
Scrap.set_rule_aliases({dyk[len(dyk) - 1]: 'Title', dyk[0]: 'Price'})
Scrap.keep_rules([dyk[len(dyk) - 1], dyk[0]])
Scrap.save('amazon-search3')
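# Optional follow-up sketch (not part of the original): reload the saved model and
# fetch the aliased Title/Price groups for another query URL (chosen here only as
# an illustration).
reloaded = AutoScraper()
reloaded.load('amazon-search3')
print(reloaded.get_result_similar('https://www.amazon.in/s?k=macbook', group_by_alias=True))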
Example #13
import os
import sys
import requests
from time import time as timer
from urllib.parse import urljoin, urlparse
from multiprocessing.pool import ThreadPool

from autoscraper import AutoScraper
from expression.core import pipe
from expression.collections import Seq, seq

standard_ebooks_url = "https://standardebooks.org/ebooks"

# Navigation Scraper
navigation_scraper = AutoScraper()
scraped_pages_urls = navigation_scraper.build(
    standard_ebooks_url, ["/ebooks/?page=2", "/ebooks/?page=3"])
pages_urls = Seq(scraped_pages_urls).pipe(
    seq.map(lambda page: urljoin(standard_ebooks_url, page)), )

# Page Scraper
page_scraper = AutoScraper()
books_urls = page_scraper.build(standard_ebooks_url, [
    "/ebooks/ford-madox-ford/some-do-not",
    "/ebooks/booth-tarkington/the-turmoil",
    "/ebooks/anatole-france/penguin-island/a-w-evans",
    "/ebooks/edgar-allan-poe/the-narrative-of-arthur-gordon-pym-of-nantucket"
],
                                update=True)
for page in pages_urls:
    print(page)
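# Hedged extension of the snippet above: apply the page scraper to every navigation
# page collected earlier to gather book links from the whole catalogue.
all_books_urls = []
for page_url in pages_urls:
    all_books_urls.extend(page_scraper.get_result_similar(page_url))
print(len(all_books_urls), 'book links collected')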
Example #14
def url_data():
    about()
    st.info("This feature has limited functionality")
    url=st.text_input("Webpage URL",help="Enter a url where your data is placed")
    if url=="":
        st.info("Please enter a valid input to get started")
        st.stop()
    
    #getting data Column names as user input
    column_name=st.text_input("Enter candidate column name",key="value")
    value_list=column_name.split(",")
    
    #getting data example for reference
    candidate=st.text_input("Candidate example value",key="candidates",help="use ; as separator to enter another value")
    items_list=candidate.split(";")
    #st.write(items)
    
    # create object
    scraper = AutoScraper()
    # feeding for scraping
    final_result = scraper.build(url,items_list)
    # display result
    results=scraper.get_result_similar(url,grouped=True,keep_order=True)
    result={}
    for key,value in results.items():
        if value not in result.values():
            result[key]=value
            
    orient_df=pd.DataFrame.from_dict(result,orient="index")
    df=orient_df.transpose()
    
    df.columns=value_list
    df.fillna(value=float("nan"),inplace=True)
    st.write(df)
    
    cols=df.columns.tolist()
    col1,col2=st.beta_columns(2)
 
    target=col1.selectbox("Select Target", cols,key="target")


    
    typelist=['binary','multiclass','regression','time series regression','time series multiclass','time series binary']
    p_type=col2.selectbox("Select problem type",typelist,key="p_type")     
    st.write("hey")
    x=df.drop(columns=target)
    y=df[target]
    x_train,x_test,y_train,y_test=evalml.preprocessing.split_data(x,y,problem_type=p_type)

    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()


    rank=automl.rankings

#checking best pipeline     ###############################################################

    best_pipeline=automl.best_pipeline
    description=automl.describe_pipeline(automl.rankings.iloc[0]["id"])

### Optimize the code


### Evaluate on hold out data
    problem_list=['binary','time series binary']
    problem_list2=['multiclass','time series multiclass']

    cola,col_b,colc=st.beta_columns(3)
    
    if p_type in problem_list:
        objective=col_b.selectbox("select objective",objectives().binary_obj,key="objective selector")  
        best_pipeline.score(x_test, y_test, objectives=["auc","f1","Precision","Recall"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['f1', 'precision'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict_proba(x_test).to_dataframe()


# for multiclass type problem
    elif p_type in problem_list2:
        objective=col_b.selectbox("select objective",objectives().multiclass_obj,key="objective selector") 
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass","MCC multiclass","accuracy multiclass"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict(x_test).to_series()

    
# for regression type problems
    else:
        objective=col_b.selectbox("select objective",objectives().regression_obj,key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2","MSE","MAE","Root Mean Squared Error"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['Root Mean Squared Error', 'MSE','MAE'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        tunned_pipeline.fit(x_train,y_train)

        pred=tunned_pipeline.predict(x_test).to_series()
                
    file=open("model_details.txt","w")
    str_dict=repr(tunned_description)
    file.write(str_dict)
    file.close()
    def get_binary_file_downloader_html(bin_file, file_label='File'):
            with open(bin_file, 'rb') as f:
                data = f.read()
                bin_str = base64.b64encode(data).decode()
                href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
                return href                
    col1,col2,col3=st.beta_columns([1,1,1])        
    if col2.button("Predict Results",key="output",help="shows results"):
            st.spinner()
            with st.spinner(text='In progress'):
                 st.info("Wait while we are selecting a best algoritham for your problem..Hold your breath.")
                 time.sleep(20)
            st.info("Done. Here you go.")
            st.write(pred)

    col11,col12=st.beta_columns([3,1])
    with col11:
        with st.beta_expander("Compare Models"):
                st.write(tunned_rankings)
        
    with col12:
        with st.beta_expander("Best Pipeline"):
                st.success(tunned_pipeline)
                st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'), unsafe_allow_html=True)
Example #15
from autoscraper import AutoScraper
from flask import Flask, request, escape

flipkart_scraper = AutoScraper()
flipkart_scraper.load('flipkart-search')
app = Flask(__name__)


def get_flipkart_result(search_query):
    url = 'https://www.flipkart.com/search?q=%s&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off' % search_query

    result = flipkart_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    print(list(result.values())[0])
    print(list(result.values())[1])
    for alias in result:
        print(alias)
    for i in range(len(list(result.values())[0])):
        try:

            final_result.append({alias: result[alias][i] for alias in result})
        except IndexError:
            pass
    return final_result


@app.route('/', methods=['GET'])
Example #16
from autoscraper import AutoScraper
import pandas as pd

pd.set_option('display.max_rows', None)

tickers = ['SCHB', 'AMZN', 'AAPL', 'MSFT', 'TSLA', 'AMD', 'NFLX']

scraper = AutoScraper()
scraper.load('../finviz_table')

for ticker in tickers:
    url = f'https://finviz.com/quote.ashx?t={ticker}'
    result = scraper.get_result(url)[0]

    index = result.index('Index')
    df = pd.DataFrame(zip(result[index:], result[:index]),
                      columns=['Attributes', 'Values'])

    print(f'\n{ticker} Data: ')
    print(df.set_index('Attributes'))
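# Hypothetical sketch (not from the original) of how a model like '../finviz_table'
# could be built in the first place; the wanted_list values are illustrative guesses,
# not the actual rules stored in the saved file, and it is saved under a new name
# to avoid overwriting the existing model.
builder = AutoScraper()
builder.build('https://finviz.com/quote.ashx?t=AAPL', wanted_list=['Index', 'P/E'])
builder.save('finviz_table_rebuilt')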
Example #17
import json

from autoscraper import AutoScraper
from flask import Flask, request, render_template

PAGINA_scraper = AutoScraper()
PAGINA_scraper.load('./rosario3-search')
app = Flask(__name__)


def get_pagina_result(url, link):
    PAGINA_scraper = AutoScraper()
    PAGINA_scraper.load('./' + url + '-search')
    result = PAGINA_scraper.get_result_similar(link, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    for i in range(len(list(result.values())[0])):
        final_result.append({alias: result[alias][i] for alias in result})
    return final_result


@app.route('/AutoScraper', methods=['GET'])
def autoscraper(Link=None, Metodo=None):
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]
    scraper = AutoScraper()
Example #18
async def test_async_autoscraper():
    scraper = AutoScraper()
    scraper.use_async = True
    result = scraper.build(url, wanted_list)
    print(result)
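# Note: `url` and `wanted_list` are not defined inside this test snippet. A minimal,
# hypothetical setup (values borrowed from Example #2 above) could be:
url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
wanted_list = ['What are metaclasses in Python?']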
Example #19
"""
Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1RI_PvSSKJl-t3dGJNxeTIXiRXPbrbzyM
"""

!pip install git+https://github.com/alirezamika/autoscraper.git



from autoscraper import AutoScraper

url = 'http://wikicfp.com/cfp/call?conference=medical%20imaging&page=2'
category = ['Event']

scrape = AutoScraper()
final = scrape.build(url, category)
print(final)

# print every 5th scraped element, starting at index 6
for i in range(6,len(final),5):
  print(final[i]+'\n')

"""# **Main Code Begins Here --**"""

import pandas as pd
import numpy as np

single_topics = ['5G','aerospace','automation','blockchain','bussiness','cancer','economics']
double_topics = [['medical','imaging'],['fuzzy','systems']]

events = []