Пример #1
0
    l_cursor.close()

    # results output
    print('Writing results to', l_resultPath)
    with open(l_resultPath, 'w') as l_fOutA:
        l_csvWriterA = \
            csv.writer(l_fOutA, delimiter=';', quotechar='"', lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC)
        l_csvWriterA.writerow(['ID', 'MERGE_KEY', 'NAME', 'ADDRESS', 'CP', 'CITY', 'CREATION',
                               'SIRET', 'TYPE', 'COUNT', 'OWNER', 'TEL1', 'TEL2', 'TEL3', 'TEL4',
                               'MAIL', 'WEB1', 'WEB2', 'WEB3', 'WEB4', 'HOURS', 'BUSINESS', 'ADDITIONAL'])

        l_cursor = l_connector.cursor(buffered=True)
        l_cursor.execute('select * from `{0}` order by left(CP,2),NAME'.format(l_resultView))

        for l_row in l_cursor:
            l_rowCsv = [CommonFunctions.cleanField(x) for x in list(l_row)]
            l_csvWriterA.writerow(l_row)

        l_cursor.close()

    # origin identification
    l_tbId = 0
    l_joinBlock = ''
    l_columnsBlock = ''
    for l_tbA, l_tbB in l_tables:
        l_viewName = re.sub('^TB_', 'V_', l_tbA)
        l_joinBlock += '            ' + \
            'left outer join `{0}` as `V{1}` on R.MERGE_KEY = `V{1}`.MERGE_KEY\n'.format(l_viewName, l_tbId)
        l_columnsBlock += '            ' + \
            ', if(isnull(`V{0}`.MERGE_KEY), "", "X") as `{1}`\n'.format(l_tbId, l_tbA)
Пример #2
0
def doOneCompany(p_driver, p_url, p_fOutMain, p_fOutSecondary, p_minDelay, p_maxDelay, p_id):
    """
    Scrape one company page and append what was found to the two CSV outputs.

    The page at ``p_url`` is loaded, the function waits up to 10 seconds for a ``<footer>``
    element, then parses the ``<body>`` HTML and extracts name, street address, postal code,
    city, web site, phone numbers and business categories. One row goes to ``p_fOutMain``;
    ``p_fOutSecondary`` receives one web-site row (written even when the value is empty) plus
    one row per phone number and per business category.

    When the company name comes back empty the page is reloaded after a pause that starts at
    60 seconds and doubles on every retry.
    NOTE(review): there is no upper bound on the number of retries.

    :param p_driver: Selenium WebDriver used to load the page
    :param p_url: URL of the company page
    :param p_fOutMain: writable text file receiving the main record
    :param p_fOutSecondary: writable text file receiving the detail rows
    :param p_minDelay: not used by this function
    :param p_maxDelay: not used by this function
    :param p_id: identifier written as the first column of every row
    :return: True once the record has been written, False if the footer never showed up
    """
    def l_escape(p_value):
        # values are wrapped in double quotes by hand, so embedded quotes must be doubled
        return p_value.replace('"', '""')

    def l_mainField(p_value):
        # main-file fields are cleaned (again) and then quote-escaped
        return l_escape(CommonFunctions.cleanField(p_value))

    def l_writeSecondary(p_type, p_raw, p_clean):
        # row layout: ID;"TYPE";"RAW";"CLEAN";"FLAG" (FLAG is always empty here)
        p_fOutSecondary.write('{0};"{1}";"{2}";"{3}";"{4}"\n'.format(
            p_id, p_type, l_escape(p_raw), l_escape(p_clean), ''))

    print('---[{0}]---'.format(p_id))

    l_wait = 60
    while True:
        # go to the base Url
        p_driver.get(p_url)

        try:
            WebDriverWait(p_driver, 10).until(EC.presence_of_element_located(
                (By.XPATH, '//footer')))
        except EX.TimeoutException:
            print('Footer not found ... Something is not right')
            # explicit False (was a bare return) so failure mirrors the True returned on success
            return False

        # get page HTML; find_element(By.XPATH, ...) replaces find_element_by_xpath,
        # which no longer exists in Selenium 4
        l_pageHtml = p_driver.find_element(By.XPATH, '//body').get_attribute('innerHTML')

        # extract a full xml/html tree from the page
        l_tree = html.fromstring(l_pageHtml)

        print(p_url, '-->', len(l_pageHtml))

        l_name = CommonFunctions.getUnique(l_tree, '//h1[@itemprop="name"]')
        l_name = CommonFunctions.cleanField(l_name)
        print('   l_name        :', l_name)

        if l_name != '':
            break

        # no name found: back off, then reload the page
        print('Waiting for {0} seconds ...'.format(l_wait))
        time.sleep(l_wait)
        l_wait *= 2

    l_address = CommonFunctions.getUnique(l_tree, '//span[@itemprop="streetAddress"]')
    l_address = CommonFunctions.cleanField(l_address)
    print('   l_address     :', l_address)

    l_zip = CommonFunctions.getUnique(l_tree, '//span[@itemprop="postalCode"]')
    # keep digits only
    l_zip = re.sub(r'\D', '', l_zip)
    l_zip = CommonFunctions.cleanField(l_zip)
    print('   l_zip         :', l_zip)

    l_city = CommonFunctions.getUnique(l_tree, '//span[@itemprop="addressLocality"]')
    l_city = CommonFunctions.cleanField(l_city)
    print('   l_city        :', l_city)

    l_web = CommonFunctions.getUnique(l_tree, '//p[@class="websiteAndShare"]/a')
    l_web = CommonFunctions.cleanField(l_web)
    print('   l_web         :', l_web)

    l_writeSecondary('WebSite', l_web, l_web)

    l_telList = []
    for l_telItem in l_tree.xpath('//p[@itemprop="telephone"]'):
        l_oneTel = l_telItem.text_content()
        print('   Tel           :', l_oneTel)
        l_writeSecondary('UnspecifiedPhone', l_oneTel, l_oneTel)
        l_telList.append(l_oneTel)

    l_businessList = []
    for l_businessRow in l_tree.xpath('//li[@class="label-child"]'):
        l_businessCategory = CommonFunctions.cleanField(l_businessRow.text_content())
        print('   Business      :', l_businessCategory)

        l_businessList.append(l_businessCategory)
        l_writeSecondary('BusinessCategory', l_businessCategory, l_businessCategory)

    # the main file has exactly four phone columns: pad with blanks / drop the surplus
    l_telList = (l_telList + ['', '', '', ''])[0:4]

    # output to CSV file (main): the unquoted ID followed by 21 quoted columns
    l_mainFields = [
        l_mainField(l_name),                      # NAME
        l_mainField(l_address),                   # ADDRESS
        l_mainField(l_zip),                       # CP
        l_mainField(l_city),                      # CITY
        '',                                       # CREATION
        '',                                       # SIRET
        '',                                       # TYPE
        '',                                       # COUNT
        '',                                       # OWNER
    ]
    l_mainFields += [l_mainField(l_tel) for l_tel in l_telList]   # TEL1 - TEL4
    l_mainFields += [
        '',                                       # MAIL
        l_mainField(l_web),                       # WEB1
        '',                                       # WEB2
        '',                                       # WEB3
        '',                                       # WEB4
        '',                                       # HOURS
        l_mainField('|'.join(l_businessList)),    # BUSINESS
        '',                                       # ADDITIONAL
    ]
    p_fOutMain.write('{0};{1}\n'.format(
        p_id, ';'.join('"{0}"'.format(l_field) for l_field in l_mainFields)))

    p_fOutMain.flush()
    p_fOutSecondary.flush()

    return True
Пример #3
0
def getOneCompany(p_driver, p_fOutMain, p_fOutSecondary, p_url, p_id):
    """
    Scrape one company page and append what was found to the two CSV outputs.

    The page at ``p_url`` is loaded and the function waits up to 10 seconds for the
    ``shareButtons`` div (the wait's timeout exception is not caught here). It then extracts
    name, address, legal details, phone / fax / mobile numbers, web site, e-mail and business
    categories from the ``<body>`` HTML.

    The e-mail is only published as an image: its URL is obtained from ``getEmail`` (defined
    elsewhere in this file), the image is saved under ``g_emailDir``, post-processed with the
    external ``convert`` command and read with the external ``tesseract`` command.

    One row is written to ``p_fOutMain``; one row per business category and per non-empty
    contact item is written to ``p_fOutSecondary``.

    :param p_driver: Selenium WebDriver used to load the page
    :param p_fOutMain: writable text file receiving the main record
    :param p_fOutSecondary: writable text file receiving the detail rows
    :param p_url: URL of the company page
    :param p_id: identifier written as the first column of every row
    :return: None
    """
    def l_escape(p_value):
        # values are wrapped in double quotes by hand, so embedded quotes must be doubled
        return p_value.replace('"', '""')

    def l_mainField(p_value):
        # main-file fields are cleaned (again) and then quote-escaped
        return l_escape(CommonFunctions.cleanField(p_value))

    def l_writeSecondary(p_type, p_raw, p_clean):
        # row layout: ID;"TYPE";"RAW";"CLEAN";"FLAG" (FLAG is always empty here)
        p_fOutSecondary.write('{0};"{1}";"{2}";"{3}";"{4}"\n'.format(
            p_id, p_type, l_escape(p_raw), l_escape(p_clean), ''))

    def l_lastText(p_xpath, p_label):
        # cleaned text of the LAST node matching p_xpath ('' when nothing matches);
        # every match is echoed to the console
        l_value = ''
        for l_node in l_tree.xpath(p_xpath):
            l_value = CommonFunctions.cleanField(l_node.text_content())
            print(p_label, l_value)
        return l_value

    print('---[{0}]---'.format(p_id))

    p_driver.get(p_url)

    WebDriverWait(p_driver, 10).until(EC.presence_of_element_located(
        (By.XPATH, '//div[@class="shareButtons"]')))

    # get page HTML; find_element(By.XPATH, ...) replaces find_element_by_xpath,
    # which no longer exists in Selenium 4
    l_pageHtml = p_driver.find_element(By.XPATH, '//body').get_attribute('innerHTML')

    # extract a full xml/html tree from the page
    l_tree = html.fromstring(l_pageHtml)

    print(p_url, '-->', len(l_pageHtml))

    # --- identity / address (raw values are echoed, cleaning happens afterwards)
    l_name = CommonFunctions.getUnique(l_tree, '//h2/span[@itemprop="name"]')
    print('   l_name        :', l_name)
    l_address = CommonFunctions.getUnique(l_tree, '//div/span[@itemprop="streetAddress"]')
    print('   l_address     :', l_address)
    l_zip = CommonFunctions.getUnique(l_tree, '//div/span[@itemprop="postalCode"]')
    print('   l_zip         :', l_zip)
    l_city = CommonFunctions.getUnique(l_tree, '//div/a[@itemprop="addressLocality"]/..')
    print('   l_city        :', l_city)

    # --- labelled details: the value sits in the div following the one holding the bold label
    l_creation = CommonFunctions.getUnique(l_tree, '//div/b[.="Créé:"]/../following-sibling::div[1]')
    print('   l_creation    :', l_creation)
    l_tvaSiret = CommonFunctions.getUnique(l_tree, '//div/b[.="TVA / SIRET:"]/../following-sibling::div[1]')
    print('   l_tvaSiret    :', l_tvaSiret)
    l_type = CommonFunctions.getUnique(l_tree, '//div/b[.="Type d\'entreprise:"]/../following-sibling::div[1]')
    print('   l_type        :', l_type)
    l_headCount = CommonFunctions.getUnique(l_tree, '//div/b[.="Nombre d\'employés:"]/../following-sibling::div[1]')
    print('   l_headCount   :', l_headCount)
    l_owner = CommonFunctions.getUnique(l_tree, '//div/b[.="Propriétaire / PDG:"]/../following-sibling::div[1]')
    print('   l_owner       :', l_owner)

    l_name = CommonFunctions.cleanField(l_name)
    l_address = CommonFunctions.cleanField(l_address)
    l_zip = CommonFunctions.cleanField(l_zip)
    l_city = CommonFunctions.cleanField(l_city)
    l_creation = CommonFunctions.cleanField(l_creation)
    l_tvaSiret = CommonFunctions.cleanField(l_tvaSiret)
    l_type = CommonFunctions.cleanField(l_type)
    l_headCount = CommonFunctions.cleanField(l_headCount)
    l_owner = CommonFunctions.cleanField(l_owner)

    # --- contact items
    l_telNumber = l_lastText('//div/span[@itemprop="telephone"]', '   l_telNumber   :')
    l_faxNumber = l_lastText('//div/span[@itemprop="faxNumber"]', '   l_faxNumber   :')
    l_mobileNumber = l_lastText('//span/i[@class="icon-mobile-phone"]/../../span', '   l_mobileNumber:')
    l_webSite = l_lastText('//span/i[@class="icon-globe"]/../../a', '   l_website     :')

    # --- e-mail (published as an image, recovered through OCR)
    l_mailAddressRaw = ''
    l_mailAddress = ''
    for l_mail in l_tree.xpath('//span/i[@class="icon-envelope-alt"]/../../a'):
        if l_mail.text_content() != "afficher l'email":
            continue

        l_mailImgUrl = getEmail(p_driver, p_url)
        print('   mail img      :', l_mailImgUrl)

        l_fileStem = 'mail_{0}_{1}'.format(p_id, re.sub(r'\s+', '_', l_name))
        l_mailImgPathRaw = os.path.join(g_emailDir, l_fileStem + '.png')
        l_mailImgPath = os.path.join(g_emailDir, l_fileStem + '-X.png')
        urllib.request.urlretrieve(l_mailImgUrl, l_mailImgPathRaw)

        # flatten, enlarge and binarize the image before OCR
        # (earlier attempts used '-morphology erode:1 square' and '-unsharp 0x8')
        subprocess.call([
            'convert',
            l_mailImgPathRaw,
            '-flatten',
            '-resize', '10000x',
            '-threshold', '30%',
            l_mailImgPath
        ])
        l_ocrBytes = subprocess.check_output([
            'tesseract',
            l_mailImgPath,
            'stdout'
        ])
        print('   Mail raw      :', l_ocrBytes)

        # 'Q' followed by the UTF-8 bytes of the 'fi' ligature (EF AC 81) stands for '@'
        # (presumably a recurring OCR misread). This is a plain byte substitution; it
        # replaces an earlier repr()/re.sub()/eval() round-trip that did the same thing.
        l_ocrText = l_ocrBytes.replace(b'Q\xef\xac\x81', b'@').decode('utf-8')

        # RAW value for the secondary file: decoded text on a single line
        # (it used to be written as a bytes repr, i.e. b'...')
        l_mailAddressRaw = ' '.join(l_ocrText.split())

        # CLEAN value: no whitespace at all, typographic dashes turned into '-'
        l_mailAddress = re.sub(r'\s+', '', l_ocrText).strip()
        l_mailAddress = re.sub('[‒–—―]', '-', l_mailAddress).strip()

        # put back the dot when the OCR dropped it in front of a known ending
        for l_end in ['com', 'fr', 'net']:
            l_mailAddress = re.sub(r'([^\.]){0}$'.format(l_end), r'\1.' + l_end, l_mailAddress).strip()

        l_mailAddress = CommonFunctions.cleanField(l_mailAddress)
        print('   l_mailAddress :', l_mailAddress)

    # --- business categories
    l_businessList = []
    for l_businessRow in l_tree.xpath('//dl/div[@class="col-sm-9"]/a'):
        l_businessCategoryRaw = l_businessRow.text_content()

        # keep what precedes the first " -"; when the label has no such suffix use the whole
        # label (previously the name was unbound, or stale from the preceding row)
        l_match = re.match(r'^([^-]+)\s-', l_businessCategoryRaw)
        if l_match:
            l_businessCategory = l_match.group(1)
        else:
            l_businessCategory = l_businessCategoryRaw.strip()
        print('   Business      :', l_businessCategory)
        l_businessList.append(l_businessCategory)

        l_writeSecondary('BusinessCategory', l_businessCategoryRaw, l_businessCategory)

    # --- secondary rows for the contact items that were found
    if l_telNumber != '':
        l_writeSecondary('FixedPhone', l_telNumber, l_telNumber)
    if l_faxNumber != '':
        l_writeSecondary('Fax', l_faxNumber, l_faxNumber)
    if l_mobileNumber != '':
        l_writeSecondary('MobilePhone', l_mobileNumber, l_mobileNumber)
    if l_webSite != '':
        l_writeSecondary('WebSite', l_webSite, l_webSite)
    if l_mailAddress != '':
        l_writeSecondary('Email', l_mailAddressRaw, l_mailAddress)

    # output to CSV file (main): the unquoted ID followed by 21 quoted columns
    l_mainFields = [
        l_mainField(l_name),                      # NAME
        l_mainField(l_address),                   # ADDRESS
        l_mainField(l_zip),                       # CP
        l_mainField(l_city),                      # CITY
        l_mainField(l_creation),                  # CREATION
        l_mainField(l_tvaSiret),                  # SIRET
        l_mainField(l_type),                      # TYPE
        l_mainField(l_headCount),                 # COUNT
        l_mainField(l_owner),                     # OWNER
        l_mainField(l_telNumber),                 # TEL1
        l_mainField(l_faxNumber),                 # TEL2
        l_mainField(l_mobileNumber),              # TEL3
        '',                                       # TEL4
        l_mainField(l_mailAddress),               # MAIL
        l_mainField(l_webSite),                   # WEB1
        '',                                       # WEB2
        '',                                       # WEB3
        '',                                       # WEB4
        '',                                       # HOURS
        l_mainField('|'.join(l_businessList)),    # BUSINESS
        '',                                       # ADDITIONAL
    ]
    p_fOutMain.write('{0};{1}\n'.format(
        p_id, ';'.join('"{0}"'.format(l_field) for l_field in l_mainFields)))

    p_fOutMain.flush()
    p_fOutSecondary.flush()
Пример #4
0
def doOneCompany(p_driver, p_fOutMain, p_fOutSecondary, p_id):
    """
    Scrape the company page currently shown by the driver and append it to the two CSV outputs.

    The function waits up to 10 seconds for the ``<h1 itemprop="name">`` element and reads the
    company name from it. Whenever ``killPopup`` (defined elsewhere in this file) returns a true
    value, the wait is performed again. Address, phone numbers, web sites, opening hours,
    business categories and the description are then extracted from the ``<body>`` HTML.

    One row is written to ``p_fOutMain``; one row per phone number, web site, opening-hours
    entry and business category is written to ``p_fOutSecondary``.

    :param p_driver: Selenium WebDriver already positioned on the company page
    :param p_fOutMain: writable text file receiving the main record
    :param p_fOutSecondary: writable text file receiving the detail rows
    :param p_id: identifier written as the first column of every row
    :return: True once the record has been written, False when the name element times out
             or the ``<body>`` element cannot be found
    """
    def l_escape(p_value):
        # values are wrapped in double quotes by hand, so embedded quotes must be doubled
        return p_value.replace('"', '""')

    def l_mainField(p_value):
        # main-file fields are cleaned (again) and then quote-escaped
        return l_escape(CommonFunctions.cleanField(p_value))

    def l_writeSecondary(p_type, p_raw, p_clean):
        # row layout: ID;"TYPE";"RAW";"CLEAN";"FLAG" (FLAG is always empty here)
        p_fOutSecondary.write('{0};"{1}";"{2}";"{3}";"{4}"\n'.format(
            p_id, p_type, l_escape(p_raw), l_escape(p_clean), ''))

    print('---[{0}]---'.format(p_id))

    l_name = ''
    while True:
        try:
            l_nameItem = WebDriverWait(p_driver, 10).until(EC.presence_of_element_located(
                (By.XPATH, '//h1[@itemprop="name"]')))

            l_name = l_nameItem.text
            # drop the trailing "Afficher le numéro" text that comes with the heading
            l_name = re.sub(r'\s+Afficher le numéro$', '', l_name).strip()
            print('   l_name           :', l_name)
        except EX.TimeoutException:
            print('[06] Something is badly wrong (Timeout) ...')
            return False

        # a popup was dealt with: read the name again
        if not killPopup(p_driver):
            break

    try:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath,
        # which no longer exists in Selenium 4
        l_html = p_driver.find_element(By.XPATH, '//body').get_attribute('innerHTML')
        l_tree = html.fromstring(l_html)

        l_address = CommonFunctions.getUnique(l_tree, '//span[@itemprop="streetAddress"]')
        print('   l_address        :', l_address)
        l_zip = CommonFunctions.getUnique(l_tree, '//span[@itemprop="postalCode"]')
        # remove a leading punctuation mark followed by whitespace
        l_zip = re.sub(r'^[,;:\.]\s+', '', l_zip).strip()
        print('   l_zip            :', l_zip)
        l_city = CommonFunctions.getUnique(l_tree, '//span[@itemprop="addressLocality"]')
        print('   l_city           :', l_city)

        # phone numbers: page label -> TYPE value of the secondary file
        l_telTypes = {'tél': 'FixedPhone', 'Mobile': 'MobilePhone', 'Fax': 'Fax'}
        l_telList = []
        for l_telRow in l_tree.xpath('//div[@id="coord-list-container-1"]//ul/li'):
            l_telLabel = CommonFunctions.getUnique(l_telRow, './span[@class="num-tel-label"]')
            l_telLabel = re.sub(r'\s+:$', '', l_telLabel).strip()
            l_telType = l_telTypes.get(l_telLabel, 'UnspecifiedPhone')
            print('   l_telType        :', l_telType)

            l_tel = CommonFunctions.getUnique(l_telRow, './span[@class="coord-numero"]')
            # drop a leading dot, then the surrounding whitespace
            l_tel = re.sub(r'^\.', '', l_tel).strip()
            print('   l_tel            :', l_tel)
            l_tel = CommonFunctions.cleanField(l_tel)
            l_telList.append(l_tel)

            l_writeSecondary(l_telType, l_tel, l_tel)

        # web sites and social networks
        l_webList = []
        for l_webRow in l_tree.xpath(
                '//article/div/div/h3[text()="Sites et réseaux sociaux"]/..//ul/li/a/span[@class="value"]'):
            l_webSite = l_webRow.text_content().strip()
            print('   l_webSite        :', l_webSite)
            l_webSite = CommonFunctions.cleanField(l_webSite)
            l_webList.append(l_webSite)

            l_writeSecondary('WebSite', l_webSite, l_webSite)

        # opening hours, read from the "content" attribute of each entry
        l_hoursList = []
        for l_hoursRow in l_tree.xpath(
                '//ul[@class="liste-horaires-principaux"]//ul/li[@itemprop="openingHours"]'):
            l_hours = l_hoursRow.get('content')
            if l_hours is None:
                # entry without a "content" attribute: nothing to record
                # (calling .strip() on None used to raise AttributeError)
                continue
            l_hours = l_hours.strip()
            print('   l_hours          :', l_hours)
            l_hours = CommonFunctions.cleanField(l_hours)
            l_hoursList.append(l_hours)

            l_writeSecondary('OpeningHours', l_hours, l_hours)

        # business categories, stripped of leading / trailing punctuation
        l_businessList = []
        for l_businessRow in l_tree.xpath('//span[@class="activite-premiere-visibilite activite"]'):
            l_businessCategory = l_businessRow.text_content().strip()
            l_businessCategory = re.sub(r'\s+[\.;,:]$', '', l_businessCategory).strip()
            l_businessCategory = re.sub(r'^[\.;,:]\s+', '', l_businessCategory).strip()
            l_businessCategory = CommonFunctions.cleanField(l_businessCategory)
            print('   l_businessCat.   :', l_businessCategory)
            l_businessList.append(l_businessCategory)

            l_writeSecondary('BusinessCategory', l_businessCategory, l_businessCategory)

        # description: the last matching block wins, whitespace runs collapsed to one space
        l_additional = ''
        for l_description in l_tree.xpath('//div[@itemprop="description"]'):
            l_additional = l_description.text_content().strip()
            l_additional = re.sub(r'\s+', ' ', l_additional).strip()
            print('   l_additional     :', l_additional)

        # the main file has exactly four phone and four web columns: pad / truncate
        l_telList = (l_telList + ['', '', '', ''])[0:4]
        l_webList = (l_webList + ['', '', '', ''])[0:4]

        # output to CSV file (main): the unquoted ID followed by 21 quoted columns
        l_mainFields = [
            l_mainField(l_name),                      # NAME
            l_mainField(l_address),                   # ADDRESS
            l_mainField(l_zip),                       # CP
            l_mainField(l_city),                      # CITY
            '',                                       # CREATION
            '',                                       # SIRET
            '',                                       # TYPE
            '',                                       # COUNT
            '',                                       # OWNER
        ]
        l_mainFields += [l_mainField(l_tel) for l_tel in l_telList]       # TEL1 - TEL4
        l_mainFields.append('')                                           # MAIL
        l_mainFields += [l_mainField(l_web) for l_web in l_webList]       # WEB1 - WEB4
        l_mainFields += [
            l_mainField('|'.join(l_hoursList)),       # HOURS
            l_mainField('|'.join(l_businessList)),    # BUSINESS
            l_mainField(l_additional),                # ADDITIONAL
        ]
        p_fOutMain.write('{0};{1}\n'.format(
            p_id, ';'.join('"{0}"'.format(l_field) for l_field in l_mainFields)))

        p_fOutMain.flush()
        p_fOutSecondary.flush()

    except EX.NoSuchElementException:
        print('[07] Something is badly wrong (Element not found) ...')
        return False

    return True