Example #1
import data_load
import indexer
import WebParser
import FileSearcher
import WebSearcher


def webAndFile():
    # Load file data and web data via the project's helper modules
    data_load.get_traversal_data()
    file_data = indexer.read_data()
    web_data = WebParser.webData()

    # Run both searches and print their results
    print("File data search:")
    print("====================================================")
    FileSearcher.fileSearch(file_data)
    print("Web data search:")
    print("====================================================")
    WebSearcher.webSearcher(web_data)
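A minimal way to run the function above, assuming the project modules it relies on are importable (a sketch, not part of the original example):

if __name__ == "__main__":
    # Hypothetical entry point: run both searches when executed as a script
    webAndFile()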
Example #2
import os
import WebSearcher as ws


def search(query, file, num):
    # Create a per-question output directory under RawHTML/<file>/
    parent_dir = 'RawHTML/' + file
    path = os.path.join(parent_dir, str(num))
    os.mkdir(path)

    # Run the query, parse the SERP, and save the raw HTML
    se = ws.SearchEngine()
    se.search(query)
    soup = ws.make_soup(se.html)
    results = ws.parse_serp(soup)
    se.save_serp(save_dir=path)

    # Tag each parsed result with its question number
    results = [dict(item, question_number=num) for item in results]
    return results
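A sketch of how search() might be driven from a list of questions. The question list and the 'demo' folder label are hypothetical, and the parent directory is created up front because os.mkdir only creates the final path segment:

import os
import pandas as pd

# Hypothetical batch of numbered questions
questions = ["what is a serp", "how do search engines rank pages"]

os.makedirs('RawHTML/demo', exist_ok=True)  # parent folder for the numbered subdirs
all_results = []
for i, q in enumerate(questions):
    all_results.extend(search(q, file='demo', num=i))

# Inspect everything as one dataframe
print(pd.DataFrame(all_results).head())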
Example #3
import WebSearcher

WebSearcher.webSearchCall()
Example #4
import argparse
import pandas as pd
import WebSearcher as ws

parser = argparse.ArgumentParser()
parser.add_argument("-q", "--query", type=str, help="A search query")
args = parser.parse_args()

if not args.query:
    print('Must include -q arg')
else:
    print(f'Test search | query: {args.query}')

    # Initialize crawler
    se = ws.SearchEngine()

    # Conduct Search
    se.search(args.query)

    # Parse Results
    se.parse_results()

    # Shape as dataframe
    results = pd.DataFrame(se.results)
    print(results.head())

    try:
        se.save_serp(append_to='test_serp_save.json')
        se.save_results(append_to='test_results_save.json')
    except Exception as e:
        print('Save failed:')
        print(e)
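Assuming the script above is saved as test_search.py (the filename is not given in the example), it is run from the command line with a query, for instance: python test_search.py -q "pizza near me". The parsed rows are printed and then appended to the two save files named in the save calls.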
Example #5
""" Download and use locations
"""
import os
import pandas as pd
import WebSearcher as ws

# Retrieve and save latest location data
data_dir = './location_data'
ws.download_locations(data_dir)

# Read it back in
f = sorted(os.listdir(data_dir))[-1]  # Latest file by name (os.listdir order is arbitrary)
fp = os.path.join(data_dir, f)  # File path
locs = pd.read_csv(fp)  # Read

# locs.info()
#
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 102029 entries, 0 to 102028
# Data columns (total 7 columns):
# Criteria ID       102029 non-null int64
# Name              102029 non-null object
# Canonical Name    102029 non-null object
# Parent ID         101788 non-null float64
# Country Code      102013 non-null object
# Target Type       102029 non-null object
# Status            102029 non-null object
# dtypes: float64(1), int64(1), object(5)
# memory usage: 5.4+ MB

# locs.iloc[0]
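The downloaded table is typically used to pick a 'Canonical Name' and pass it along with a query. The snippet below assumes the installed WebSearcher version accepts a location argument to SearchEngine.search; the query and place are chosen arbitrarily:

# Pick one canonical location name from the table (first match is arbitrary)
boston = locs.loc[locs['Name'] == 'Boston', 'Canonical Name'].iloc[0]

se = ws.SearchEngine()
se.search('pizza', location=boston)  # location keyword assumed to be supported
se.parse_results()
print(pd.DataFrame(se.results).head())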
Example #6
""" Test parse
"""

import argparse
import pandas as pd
import WebSearcher as ws

parser = argparse.ArgumentParser()
parser.add_argument("-f", "--filepath", help="The SERP html file")
args = parser.parse_args()

if not args.filepath:
    print('Must include -f arg')
else:
    # Load the saved SERP HTML and parse it into result rows
    soup = ws.load_soup(args.filepath)
    parsed = ws.parse_serp(soup)
    results = pd.DataFrame(parsed)

    # Extract the raw SERP components as well
    cmpts = ws.extract_components(soup)
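    # A possible continuation (not part of the original example): summarize the
    # parse and persist the rows for later analysis; the output name is arbitrary.
    print(f'Parsed {len(results)} results from {args.filepath}')
    results.to_json('parsed_results.json', orient='records', lines=True)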