def webAndFile():
    """Run both search pipelines: locally indexed file data, then web data."""
    # Refresh the traversal data before reading either source.
    data_load.get_traversal_data()
    file_data = indexer.read_data()
    web_data = WebParser.webData()

    # Search the file-indexed data.
    print("File data search:")
    print("====================================================")
    FileSearcher.fileSearch(file_data)

    # Search the web data.
    print("Web data search:")
    print("====================================================")
    WebSearcher.webSearcher(web_data)
def search(query, file, num):
    """Run a WebSearcher query and archive the raw SERP HTML.

    The SERP is saved under ``RawHTML/<file>/<num>/``.

    Args:
        query: The search query string.
        file: Subdirectory name under ``RawHTML/`` grouping this batch.
        num: Question number; used as the save directory name and tagged
            onto every parsed result.

    Returns:
        A list of parsed SERP result dicts, each augmented with a
        ``question_number`` key set to ``num``.
    """
    # makedirs also creates the RawHTML/<file> parent and tolerates reruns;
    # the previous os.mkdir raised if the parent was missing or the target
    # directory already existed. (Also avoids shadowing the builtin `dir`.)
    path = os.path.join('RawHTML/' + file, str(num))
    os.makedirs(path, exist_ok=True)

    se = ws.SearchEngine()
    se.search(query)
    soup = ws.make_soup(se.html)
    results = ws.parse_serp(soup)
    se.save_serp(save_dir=path)

    # Tag each result with its question number so results from multiple
    # queries can later be distinguished/joined.
    return [dict(item, question_number=num) for item in results]
# Entry point: delegate the entire search run to the WebSearcher module.
import WebSearcher

WebSearcher.webSearchCall()
"""Test search: run one query through WebSearcher and save the SERP/results."""
import argparse

import pandas as pd
import WebSearcher as ws

parser = argparse.ArgumentParser()
parser.add_argument("-q", "--query", type=str, help="A search query")
args = parser.parse_args()

if not args.query:
    print('Must include -q arg')
else:
    print(f'Test search | query: {args.query}')

    # Initialize crawler
    se = ws.SearchEngine()

    # Conduct Search
    se.search(args.query)

    # Parse Results
    se.parse_results()

    # Shape as dataframe
    results = pd.DataFrame(se.results)
    print(results.head())

    try:
        se.save_serp(append_to='test_serp_save.json')
        se.save_results(append_to='test_results_save.json')
    except Exception as e:
        # NOTE(review): the original except clause had no body (a syntax
        # error as written). Saving is best-effort in this smoke test, so
        # report the failure and continue rather than crash.
        print(f'Save failed: {e}')
""" Download and use locations """ import os import pandas as pd import WebSearcher as ws # Retrieve and save latest location data data_dir = './location_data' ws.download_locations(data_dir) # Read it back in f = os.listdir(data_dir)[-1] # Last file fp = os.path.join(data_dir, f) # File path locs = pd.read_csv(fp) # Read # locs.info() # # <class 'pandas.core.frame.DataFrame'> # RangeIndex: 102029 entries, 0 to 102028 # Data columns (total 7 columns): # Criteria ID 102029 non-null int64 # Name 102029 non-null object # Canonical Name 102029 non-null object # Parent ID 101788 non-null float64 # Country Code 102013 non-null object # Target Type 102029 non-null object # Status 102029 non-null object # dtypes: float64(1), int64(1), object(5) # memory usage: 5.4+ MB # locs.iloc[0]
""" Test parse """ import argparse import pandas as pd import WebSearcher as ws parser = argparse.ArgumentParser() parser.add_argument("-f", "--filepath", help="The SERP html file") args = parser.parse_args() if not args.filepath: print('Must include -f arg') else: soup = ws.load_soup(args.filepath) parsed = ws.parse_serp(soup) results = pd.DataFrame(parsed) cmpts = ws.extract_components(soup)