Example #1
def start_info_collection(folder_name):
    """

    :param folder_name: category_folder name
    :return: None
    """
    queue_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
        DataCollectors_Configuration.PATH_STYLE, folder_name)
    queue_files = get_all_files(queue_path,
                                DataCollectors_Configuration.PATTERN_3)

    completed_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_INFO_ROOT,
        DataCollectors_Configuration.PATH_STYLE, folder_name)
    completed_files = get_all_files(completed_path,
                                    DataCollectors_Configuration.PATTERN_4)

    # Build the set of already-collected links so they are not queued twice.
    completed_set = set()
    for file_path in completed_files:
        if os.path.exists(file_path):
            completed_set.update(file_to_set(file_path))

    for file_path in queue_files:
        if os.path.exists(file_path):
            for url in file_to_set(file_path):
                url_split = url.split('|')
                hierarchy_path = '/'.join(url_split[2:-1])
                if url not in completed_set:
                    completed_path = '{}{}{}'.format(
                        DataCollectors_Configuration.LINIO_MEX_INFO_ROOT,
                        DataCollectors_Configuration.PATH_STYLE,
                        hierarchy_path)
                    # Create the target directory on first use.
                    if not os.path.exists(completed_path):
                        create_project_dir(completed_path)
                    urls_queue.put(url)
    urls_queue.join()
Example #2
def start_url_collection(folder_name):
    """

    :param folder_name: category_folder name
    :return: None
    """
    queue_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_HIERARCHY_ROOT,
        DataCollectors_Configuration.PATH_STYLE, folder_name)
    queue_files = get_all_files(queue_path,
                                DataCollectors_Configuration.PATTERN_1)

    completed_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
        DataCollectors_Configuration.PATH_STYLE, folder_name)
    completed_files = get_all_files(completed_path,
                                    DataCollectors_Configuration.PATTERN_2)

    # Build the set of already-collected page links so they are not queued twice.
    completed_set = set()
    for file_path in completed_files:
        if os.path.exists(file_path):
            completed_set.update(file_to_set(file_path))

    for file_path in queue_files:
        if os.path.exists(file_path):
            for url in file_to_set(file_path):
                # Each line looks like: <hierarchy>|...|<last_page>|<page_url>
                url_split = url.split('|')
                last_page = int(url_split[-2])
                page_url = url_split[-1]
                hierarchy = '|'.join(url_split[0:-2])
                hierarchy_path = '/'.join(url_split[2:-2])

                # Expand the category URL into one line per result page.
                for page_no in range(1, last_page + 1):
                    modified_url = '{}?page={}'.format(page_url, page_no)
                    line = '{}|{}'.format(hierarchy, modified_url)
                    if line not in completed_set:
                        completed_path = '{}{}{}'.format(
                            DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
                            DataCollectors_Configuration.PATH_STYLE,
                            hierarchy_path)
                        # Create the target directory on first use.
                        if not os.path.exists(completed_path):
                            create_project_dir(completed_path)
                        urls_queue.put(line)
    urls_queue.join()
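
Both collectors lean on a few small file helpers whose definitions are not shown on this page. A minimal sketch, assuming file_to_set reads a newline-delimited text file into a set and create_project_dir creates a missing directory:

import os


def file_to_set(file_name):
    """Read a newline-delimited file into a set of stripped lines."""
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.rstrip('\n'))
    return results


def create_project_dir(directory):
    """Create the directory (and any missing parents) on first use."""
    if not os.path.exists(directory):
        os.makedirs(directory)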
Example #3
def boot() -> None:
    create_project_dir(Spider.project_name)
    create_data_files(Spider.project_name, Spider.base_url)
    Spider.queue = file_to_set(Spider.queue_file)
    Spider.crawled = file_to_set(Spider.crawled_file)
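
boot() additionally assumes a create_data_files helper that seeds the two files before the first run. A minimal sketch, using the hypothetical file names queue.txt and crawled.txt, with the queue seeded with the base URL and the crawled file left empty:

import os


def create_data_files(project_name, base_url):
    """Seed the queue and crawled files for a fresh project (assumed layout)."""
    queue_file = os.path.join(project_name, 'queue.txt')      # assumed name
    crawled_file = os.path.join(project_name, 'crawled.txt')  # assumed name
    if not os.path.isfile(queue_file):
        with open(queue_file, 'w') as f:
            f.write(base_url)
    if not os.path.isfile(crawled_file):
        with open(crawled_file, 'w') as f:
            f.write('')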
Example #4
def create_jobs() -> None:
    # Queue every link that is still waiting to be crawled.
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    # Wait for the workers to finish the whole batch, then look for
    # links discovered in the meantime.
    queue.join()
    crawl()
Example #5
def crawl() -> None:
    # Re-read the queue file; the mutual recursion with create_jobs()
    # stops once the queue file is empty.
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + " links in the queue")
        create_jobs()
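
create_jobs() and crawl() only terminate because worker threads drain the queue: queue.join() returns once every put link has been matched by a task_done() call. A minimal sketch of that worker setup, with crawl_page as a hypothetical placeholder for the real page handler:

import threading
from queue import Queue

NUMBER_OF_THREADS = 8
queue = Queue()


def crawl_page(link):
    # Hypothetical placeholder: a real worker would fetch the page here
    # and append the results to the data files.
    print('processing', link)


def work():
    # Consume links forever; each task_done() call lets queue.join()
    # unblock once the whole batch has been processed.
    while True:
        link = queue.get()
        crawl_page(link)
        queue.task_done()


def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True  # workers die with the main thread
        t.start()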