def getAssociations(companyPage, name, visitedLinks, depth, depthLimit):
    """Extract employees, stakeholders and investments from a company page.

    Parameters:
        companyPage -- BeautifulSoup document of the company page.
        name        -- company name, used to locate the
                       "Beteiligungen von <name>" section header.
        visitedLinks -- list of already-crawled URLs (mutated in place
                        by the section parsers).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        (stakeholders, investments, employees) -- each a dict produced by
        the corresponding parser, or the string "NA" when that section
        was absent from the page.
    """
    # "NA" is the sentinel used throughout this crawler for "not found".
    employees = "NA"
    stakeholders = "NA"
    investments = "NA"

    # The relevant blue box section of the page.
    crefoSoup = companyPage.find("div", id="crefo")

    # Split between handelnde Personen, Anteilseigner and Beteiligungen.
    terms = [
        "Handelnde Personen:",
        "Anteilseigner:",
        "Beteiligungen von " + name.strip()
    ]
    segments = mySoupFuncs.segmentByTerms(crefoSoup, terms)

    # Only parse and build edges if segmentation found anything.
    if segments != "NA":
        # Each segment is (html, matched_term); iterate directly instead
        # of indexing with range(len(...)).
        for segment in segments:
            soup = mySoupFuncs.toSoup(segment[0])
            if segment[1] == "Handelnde Personen:":
                employees = getEmployeeInfo(soup, visitedLinks,
                                            depth, depthLimit)
            elif segment[1] == "Anteilseigner:":
                stakeholders = getStakeholders(soup, visitedLinks,
                                               depth, depthLimit)
            elif segment[1] == "Beteiligungen von " + name.strip():
                investments = getInvestments(soup, visitedLinks,
                                             depth, depthLimit)
        createCompanyEdges(stakeholders, investments, employees, name)
    return stakeholders, investments, employees
def getEmployeeInfo(soup, visitedLinks, depth, depthLimit):
    """Parse the "Handelnde Personen" section into role-keyed person dicts.

    Parameters:
        soup -- BeautifulSoup of the section to parse.
        visitedLinks -- list of already-crawled URLs (mutated in place).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        dict with keys u"Geschäftsführer", u"Aufsichtsrat", u"Vorstand",
        each mapping person name -> person info dict, or "NA" when the
        section could not be segmented.
    """
    employees = dict()
    fuehrer = dict()
    aufsichtsrat = dict()
    vorstand = dict()

    # Segment by the three management roles.
    terms = [u"Vorstand", u"Geschäftsführer", u"Aufsichtsrat"]
    segments = mySoupFuncs.segmentByTerms(soup, terms)
    if segments == "NA":
        return "NA"

    def extractName(html, textRepr):
        """Pull the person's name out of one segment.

        Entries with a salutation ("Herrn"/"Frau") end at the first
        escaped carriage return of the repr()'d text; otherwise the name
        sits between the first two <br/> tags.
        """
        for salutation in ("Herrn", "Frau"):
            if salutation in html:
                start = textRepr.index(salutation)
                end = textRepr[start:].index("\\r")
                return textRepr[start:start + end]
        # NOTE(review): start/end are computed on the raw HTML but applied
        # to the repr()'d text, so the offsets only line up by accident.
        # Kept as-is to preserve the existing behaviour -- TODO confirm.
        start = html.index("<br/>") + 5
        end = html[start:].index("<br/>")
        return textRepr[start:start + end].strip()

    for segment in segments:
        segmentSoup = mySoupFuncs.toSoup(segment[0])
        person = dict()
        stringSection = repr(segmentSoup.getText())
        person["Name"] = extractName(segment[0], stringSection)

        # Follow the person's detail link once, if not already visited.
        if segmentSoup.find("a"):
            person["Link"] = segmentSoup.find("a")["href"]
            if person["Link"] not in visitedLinks:
                visitedLinks.append(person["Link"])
                person = Crawler_person.getPersonInfo(
                    person, visitedLinks, depth, depthLimit)

        # File the person under the matching role.
        if segment[1] == "Vorstand":
            vorstand[person["Name"]] = person
        elif segment[1] == "Aufsichtsrat":
            aufsichtsrat[person["Name"]] = person
        elif segment[1] == u"Geschäftsführer":
            fuehrer[person["Name"]] = person
        else:
            print("NEW PERSON TYPE")

    # Assemble the role-keyed result.
    employees[u"Geschäftsführer"] = fuehrer
    employees[u"Aufsichtsrat"] = aufsichtsrat
    employees[u"Vorstand"] = vorstand
    return employees
def getInvestments(soup, visitedLinks, depth, depthLimit):
    """Parse the investments section into Gesellschafter/Aktionär dicts.

    Parameters:
        soup -- BeautifulSoup of the section to parse.
        visitedLinks -- list of already-crawled URLs (mutated in place).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        dict with keys "Gesellschafter" and "Aktionär", each mapping
        company name -> company info dict, or "NA" when the section
        could not be segmented.
    """
    investments = dict()
    gesellschafter = dict()
    aktionaer = dict()

    # Segment by the two ownership roles.
    terms = [u"Gesellschafter", u"Aktionär"]
    segments = mySoupFuncs.segmentByTerms(soup, terms)
    if segments == "NA":
        return "NA"

    for segment in segments:
        segmentSoup = mySoupFuncs.toSoup(segment[0])
        company = dict()
        if "(Ausland)" in segment[0]:
            # Foreign companies carry no crawlable link.
            company["Name"] = "Failure: Ausland"
        else:
            try:
                anchor = segmentSoup.find("a")
                company["Name"] = anchor.getText().strip()
                company["Link"] = anchor["href"]
                try:
                    # First decimal number in the segment is the share.
                    # NOTE(review): "\\%" keeps the original literal
                    # backslash-percent suffix; a plain "%" was probably
                    # intended -- confirm against downstream consumers.
                    company["Anteil"] = str(
                        re.findall(r"\d+,\d+", segment[0])[0].strip()) + "\\%"
                except IndexError:
                    company["Anteil"] = "NA"

                # Get more detailed information for the company once.
                if company["Link"] not in visitedLinks:
                    visitedLinks.append(company["Link"])
                    page = mySoupFuncs.getSoup(company["Link"])
                    company = getDetailedInfo(page, company, visitedLinks,
                                              depth + 1, depthLimit)

                # Add to the relevant section.
                if segment[1] == u"Gesellschafter":
                    gesellschafter[company["Name"]] = company
                elif segment[1] == u"Aktionär":
                    aktionaer[company["Name"]] = company
                else:
                    print("NEW COMPANY TYPE")
            except Exception:
                # Segment had no usable link/name structure; skip it.
                print("Couldnt retrieve investments. Data with no link.")

    investments["Gesellschafter"] = gesellschafter
    investments["Aktionär"] = aktionaer
    return investments
def getPersonInfo(personDict, visitedCompanies, depth, depthLimit):
    """Crawl a person's detail page and collect their holdings and roles.

    Parameters:
        personDict -- dict with at least "Name" and "Link" keys; this
                      dict is mutated and returned.
        visitedCompanies -- list of already-crawled company URLs
                            (mutated in place).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        the person dict extended with "Investments" (Beteiligungen) and
        "Functions" (Funktionen) sub-dicts keyed by company name.

    Side effects:
        appends to the module-level `nodes` and `edges` graph containers.
    """
    uniqueID = str(random.randrange(100000000))
    # BUG FIX: append takes a single argument -- record the node as a
    # (name, id) tuple, matching how edges are stored below.
    nodes.append((personDict["Name"], uniqueID))

    person = personDict
    person["Investments"] = dict()
    person["Functions"] = dict()

    url = personDict["Link"]
    soup = mySoupFuncs.getSoup(url)
    body = soup.find("div", class_="modal-body")
    sections = mySoupFuncs.segmentByTerms(body, ["<h2 class=\"h4\">"])
    if sections == "NA":
        return personDict

    def collectCompanies(sectionSoup, target):
        """Crawl every blue company link in the section into *target*."""
        for link in sectionSoup.find_all("a", class_="blue"):
            company = dict()
            company["Name"] = link.getText().replace(
                "\r", " ").replace("\n", " ").strip()
            company["Link"] = link["href"]
            if company["Link"] not in visitedCompanies:
                edges.append((personDict["Name"], company["Name"]))
                visitedCompanies.append(company["Link"])
                page = mySoupFuncs.getSoup(company["Link"])
                target[company["Name"]] = Crawler_companies.getDetailedInfo(
                    page, company, visitedCompanies, depth + 1, depthLimit)

    # The Beteiligungen and Funktionen sections are parsed identically;
    # only the destination dict differs.
    for section in sections:
        sectionSoup = mySoupFuncs.toSoup(section[0])
        sectionText = sectionSoup.getText()
        if "Beteiligungen" in sectionText:
            collectCompanies(sectionSoup, person["Investments"])
        elif "Funktionen" in sectionText:
            collectCompanies(sectionSoup, person["Functions"])

    return person
def getCompanySections(pages, terms, where):
    """Collect all company result cards whose segment header matches a place.

    Parameters:
        pages -- iterable of BeautifulSoup result pages.
        terms -- header terms passed to segmentByTerms.
        where -- location substring to filter segments by
                 (case-insensitive).

    Returns:
        list of <li class="card result"> tags from every matching segment.
    """
    relevantSegments = list()
    companySections = list()

    for page in pages:
        segments = mySoupFuncs.segmentByTerms(page, terms)
        # segmentByTerms signals "nothing found" with the string "NA";
        # iterating that string would loop over its characters, so skip.
        if segments == "NA":
            continue
        for segment in segments:
            header = mySoupFuncs.toSoup(
                segment[0]).span.next_sibling.next_sibling.string
            if where.lower() in header.lower():
                relevantSegments.append(segment[0])

    # Gather every company card inside each relevant segment.
    for segment in relevantSegments:
        companySections.extend(
            mySoupFuncs.toSoup(segment).find_all("li", class_="card result"))

    return companySections
def getStakeholders(soup, visitedLinks, depth, depthLimit):
    """Parse the "Anteilseigner" section into person and company stakeholders.

    Parameters:
        soup -- BeautifulSoup of the section to parse.
        visitedLinks -- list of already-crawled URLs (mutated in place).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        nested dict:
            {"Persons":   {"Gesellschafter": {...}, u"Aktionär": {...}},
             "Companies": {"Gesellschafter": {...}, u"Aktionär": {...}}}
        keyed by person/company name, or "NA" when the section could not
        be segmented.
    """
    stakeholders = dict()
    stakeholders["Persons"] = dict()
    stakeholders["Companies"] = dict()
    for group in ("Persons", "Companies"):
        stakeholders[group]["Gesellschafter"] = dict()
        stakeholders[group][u"Aktionär"] = dict()

    # Segment by the two ownership roles.
    terms = [u"Gesellschafter", u"Aktionär"]
    segments = mySoupFuncs.segmentByTerms(soup, terms)
    if segments == "NA":
        return "NA"

    def extractName(html, textRepr):
        """Pull a person's name out of one segment.

        Entries with a salutation ("Herrn"/"Frau") end at the first
        escaped carriage return of the repr()'d text; otherwise the name
        sits between the first two <br/> tags.
        """
        for salutation in ("Herrn", "Frau"):
            if salutation in html:
                start = textRepr.index(salutation)
                end = textRepr[start:].index("\\r")
                return textRepr[start:start + end]
        # NOTE(review): offsets are computed on the raw HTML but applied
        # to the repr()'d text; kept as-is to preserve existing behaviour.
        start = html.index("<br/>") + 5
        end = html[start:].index("<br/>")
        return textRepr[start:start + end].strip()

    for segment in segments:
        segmentSoup = mySoupFuncs.toSoup(segment[0])

        if "Einzelperson" not in segment[0]:
            # ---- company stakeholder ----
            company = dict()
            if "(Ausland)" in segment[0]:
                # Foreign companies carry no crawlable link.
                company["Name"] = "Failure: Ausland"
            else:
                try:
                    anchor = segmentSoup.find("a")
                    # BUG FIX: the name was unconditionally overwritten with
                    # "Bad data. Please check manually", which collapsed
                    # every company onto a single dict key. Keep the real
                    # link text, matching getInvestments.
                    company["Name"] = anchor.getText().strip()
                    company["Link"] = anchor["href"]
                    try:
                        # NOTE(review): "\\%" keeps the original literal
                        # backslash-percent suffix; plain "%" was probably
                        # intended -- confirm downstream.
                        company["Anteil"] = str(
                            re.findall(r"\d+,\d+",
                                       segment[0])[0].strip()) + "\\%"
                    except IndexError:
                        company["Anteil"] = "NA"

                    # Get more detailed information for the company once.
                    if company["Link"] not in visitedLinks:
                        visitedLinks.append(company["Link"])
                        page = mySoupFuncs.getSoup(company["Link"])
                        company = getDetailedInfo(page, company, visitedLinks,
                                                  depth + 1, depthLimit)

                    if segment[1] == u"Gesellschafter":
                        stakeholders["Companies"]["Gesellschafter"][
                            company["Name"]] = company
                    elif segment[1] == u"Aktionär":
                        stakeholders["Companies"][u"Aktionär"][
                            company["Name"]] = company
                except Exception:
                    # The original message string was split across source
                    # lines (a syntax error); repaired to one line.
                    print("Couldnt retrieve stakeholder. Data with no link")
        else:
            # ---- individual person stakeholder ----
            person = dict()
            stringSection = repr(segmentSoup.getText())
            person["Name"] = extractName(segment[0], stringSection)

            # Follow the person's detail link once, if not already visited.
            if segmentSoup.find("a"):
                person["Link"] = segmentSoup.find("a")["href"]
                if person["Link"] not in visitedLinks:
                    visitedLinks.append(person["Link"])
                    person = Crawler_person.getPersonInfo(
                        person, visitedLinks, depth, depthLimit)

            # Share percentage is optional for persons.
            try:
                person["Anteil"] = str(
                    re.findall(r"\d+,\d+", segment[0])[0].strip()) + "\\%"
            except IndexError:
                pass

            # Append to the relevant group.
            if segment[1] == u"Gesellschafter":
                stakeholders["Persons"]["Gesellschafter"][
                    person["Name"]] = person
            elif segment[1] == u"Aktionär":
                stakeholders["Persons"][u"Aktionär"][person["Name"]] = person
            else:
                print("NEW PERSON TYPE")

    return stakeholders