Example #1
import glob
import os

import parse  # project module providing load_html and parse_html


def run_bot():
    BASE_URL = 'https://www.chamber.nyc/directory.php?search=&page='
    NUM_OF_PAGES = 42
    # save_pages(BASE_URL, NUM_OF_PAGES)

    filename = "data/2021/member.csv"
    with open(filename, "a+", encoding="utf-8") as f:
        # Sort saved pages numerically by the page number embedded in the
        # filename (pageN.html), so page10 sorts after page9.
        for file in sorted(glob.glob('data/2021/saved_pages/*.html'),
                           key=lambda x: int(os.path.basename(x).split('page')[1][:-5])):
            page_soup = parse.load_html(file)
            extracted_data = parse.parse_html(page_soup)
            f.write(extracted_data)
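The save_pages helper that run_bot comments out is not shown. A minimal sketch of what it presumably does, assuming requests is available and the pageN.html naming scheme that the sort key above expects (the body is a hypothetical reconstruction):

import requests

def save_pages(base_url, num_of_pages):
    # Hypothetical sketch: download each paginated directory listing and save
    # it under the filename scheme the sort key expects (page1.html, ...).
    for n in range(1, num_of_pages + 1):
        resp = requests.get(base_url + str(n))
        resp.raise_for_status()
        path = 'data/2021/saved_pages/page{}.html'.format(n)
        with open(path, 'w', encoding='utf-8') as out:
            out.write(resp.text)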
Example #2
        def search_for(key):
            search_field.clear()
            search_field.send_keys(key)
            command_click(driver, search_button)

            # NOTE: a little hack-ish, but I can't find anything better in the API
            # (the search results open in a window with the same title)
            for handle in driver.window_handles:
                if handle != root_handle:
                    # any handle that isn't the root_handle
                    new_window = handle

            # extract results from new window
            driver.switch_to.window(new_window)
            results = parse_html(driver.page_source)

            # Close window, switch back to search page
            driver.close()
            driver.switch_to.window(root_handle)

            return results
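The handle-scanning loop works, but Selenium can also wait explicitly for the second window; a sketch of that variant, assuming the same driver and root_handle names:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def switch_to_new_window(driver, root_handle, timeout=10):
    # Block until the click has actually opened a second window.
    WebDriverWait(driver, timeout).until(EC.number_of_windows_to_be(2))
    # The new window is whichever handle isn't the one we started from.
    new_window = (set(driver.window_handles) - {root_handle}).pop()
    driver.switch_to.window(new_window)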
Example #3
import os

import pandas as pd

# get_file_list, read_html, parse_html and clean_data are project helpers.


def main():

    dir_path = './raw_html'
    #dir_path = input("Input data file path: ")

    df_record = []

    file_list = get_file_list(dir_path)
    #print(file_list)

    for file_name in file_list:
        file_path = os.path.join(dir_path, file_name)
        tree = read_html(file_path)

        record = parse_html(tree)
        df_record.append(record)

    df = pd.DataFrame(df_record)

    lagou = clean_data(df)
    #print(df.head())
    print(lagou)

    # Write the cleaned data as a GBK-encoded CSV (Chinese-locale encoding).
    lagou.to_csv('./output/lagou.csv', encoding='gbk')
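get_file_list is not defined in this example; since the loop joins each name onto dir_path, it presumably returns bare filenames. A minimal hypothetical sketch:

import os

def get_file_list(dir_path):
    # Hypothetical sketch: list the raw HTML filenames in the input directory.
    return sorted(f for f in os.listdir(dir_path) if f.endswith('.html'))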
Example #4
File: main.py Project: LuYangP/weibo
import re
import sys

import requests

# cons (URL constants), login, parse_html and parse_ajax are project modules.


def main():
    LOGIN = True
    username = sys.argv[1]
    password = sys.argv[2]

    s = requests.Session()
    if LOGIN:
        s.cookies = login(username, password)
    else:
        r = s.get(cons.VISITOR_INCARNATE)

    r = s.get(cons.WEIBO_MAIN)
    r = r.content.decode('utf8')
    # Pull the logged-in user's uid out of the page's inline JS config.
    uid = re.search(r"\$CONFIG\['uid'\]='([0-9]+)';", r).group(1)
    count = 0
    for i in range(1, 11):
        r = s.get(cons.WEIBO_HOME_NUMPAGE.format(uid, i-1, i))
        print('PAGE:', i, 'HTML')
        count += parse_html(r.content.decode('utf8'))
        for j in range(0, 2):
            r = s.get(cons.WEIBO_HOME_AJAX.format(i, i, j))
            print('PAGE:', i, 'BAR:', j)
            count += parse_ajax(r.content.decode('unicode-escape'))
    print(count)
Example #5
def fetch_remote_item_data():
    return parse_html(fetch_html())
Example #6
File: main.py Project: adregan/penncrawl
import json
from output import create_output_dir
from output import save_author_data
from parse import parse_html

if __name__ == '__main__':
    with open('authors.json', 'r') as file:
        authors = json.load(file)

    output_path = create_output_dir()

    for author in authors:
        name = author.get('author').strip()
        link = author.get('link')
        html = author.get('html')
        recordings = parse_html(html, link)
        author_data = {'name': name, 'link': link, 'recordings': recordings}
        save_author_data(name, author_data, output_path)
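save_author_data comes from the project's output module and is not shown; a hypothetical sketch of the shape it likely has, writing one JSON file per author under the output directory:

import json
import os

def save_author_data(name, author_data, output_path):
    # Hypothetical sketch: persist one author's scraped data as JSON.
    safe_name = name.replace(' ', '_')
    with open(os.path.join(output_path, safe_name + '.json'), 'w') as out:
        json.dump(author_data, out, indent=2)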