def getAssociations(companyPage, name, visitedLinks, depth, depthLimit):
    """Extract employees, stakeholders and investments from a company page.

    Parameters:
        companyPage -- BeautifulSoup document of the company page.
        name        -- company name, used to locate the
                       "Beteiligungen von <name>" section header.
        visitedLinks -- list of already-crawled URLs (mutated in place
                        by the section parsers).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        (stakeholders, investments, employees) -- each a dict produced by
        the corresponding parser, or the string "NA" when that section
        was absent from the page.
    """
    # "NA" is the sentinel used throughout this crawler for "not found".
    employees = "NA"
    stakeholders = "NA"
    investments = "NA"

    # The relevant blue box section of the page.
    crefoSoup = companyPage.find("div", id="crefo")

    # Split between handelnde Personen, Anteilseigner and Beteiligungen.
    terms = [
        "Handelnde Personen:",
        "Anteilseigner:",
        "Beteiligungen von " + name.strip()
    ]
    segments = mySoupFuncs.segmentByTerms(crefoSoup, terms)

    # Only parse and build edges if segmentation found anything.
    if segments != "NA":
        # Each segment is (html, matched_term); iterate directly instead
        # of indexing with range(len(...)).
        for segment in segments:
            soup = mySoupFuncs.toSoup(segment[0])
            if segment[1] == "Handelnde Personen:":
                employees = getEmployeeInfo(soup, visitedLinks,
                                            depth, depthLimit)
            elif segment[1] == "Anteilseigner:":
                stakeholders = getStakeholders(soup, visitedLinks,
                                               depth, depthLimit)
            elif segment[1] == "Beteiligungen von " + name.strip():
                investments = getInvestments(soup, visitedLinks,
                                             depth, depthLimit)
        createCompanyEdges(stakeholders, investments, employees, name)
    return stakeholders, investments, employees
def getEmployeeInfo(soup, visitedLinks, depth, depthLimit):
    """Parse the "Handelnde Personen" section into role-keyed person dicts.

    Parameters:
        soup -- BeautifulSoup of the section to parse.
        visitedLinks -- list of already-crawled URLs (mutated in place).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        dict with keys u"Geschäftsführer", u"Aufsichtsrat", u"Vorstand",
        each mapping person name -> person info dict, or "NA" when the
        section could not be segmented.
    """
    employees = dict()
    fuehrer = dict()
    aufsichtsrat = dict()
    vorstand = dict()

    # Segment by the three management roles.
    terms = [u"Vorstand", u"Geschäftsführer", u"Aufsichtsrat"]
    segments = mySoupFuncs.segmentByTerms(soup, terms)
    if segments == "NA":
        return "NA"

    def extractName(html, textRepr):
        """Pull the person's name out of one segment.

        Entries with a salutation ("Herrn"/"Frau") end at the first
        escaped carriage return of the repr()'d text; otherwise the name
        sits between the first two <br/> tags.
        """
        for salutation in ("Herrn", "Frau"):
            if salutation in html:
                start = textRepr.index(salutation)
                end = textRepr[start:].index("\\r")
                return textRepr[start:start + end]
        # NOTE(review): start/end are computed on the raw HTML but applied
        # to the repr()'d text, so the offsets only line up by accident.
        # Kept as-is to preserve the existing behaviour -- TODO confirm.
        start = html.index("<br/>") + 5
        end = html[start:].index("<br/>")
        return textRepr[start:start + end].strip()

    for segment in segments:
        segmentSoup = mySoupFuncs.toSoup(segment[0])
        person = dict()
        stringSection = repr(segmentSoup.getText())
        person["Name"] = extractName(segment[0], stringSection)

        # Follow the person's detail link once, if not already visited.
        if segmentSoup.find("a"):
            person["Link"] = segmentSoup.find("a")["href"]
            if person["Link"] not in visitedLinks:
                visitedLinks.append(person["Link"])
                person = Crawler_person.getPersonInfo(
                    person, visitedLinks, depth, depthLimit)

        # File the person under the matching role.
        if segment[1] == "Vorstand":
            vorstand[person["Name"]] = person
        elif segment[1] == "Aufsichtsrat":
            aufsichtsrat[person["Name"]] = person
        elif segment[1] == u"Geschäftsführer":
            fuehrer[person["Name"]] = person
        else:
            print("NEW PERSON TYPE")

    # Assemble the role-keyed result.
    employees[u"Geschäftsführer"] = fuehrer
    employees[u"Aufsichtsrat"] = aufsichtsrat
    employees[u"Vorstand"] = vorstand
    return employees
def getInvestments(soup, visitedLinks, depth, depthLimit):
    """Parse the investments section into Gesellschafter/Aktionär dicts.

    Parameters:
        soup -- BeautifulSoup of the section to parse.
        visitedLinks -- list of already-crawled URLs (mutated in place).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        dict with keys "Gesellschafter" and "Aktionär", each mapping
        company name -> company info dict, or "NA" when the section
        could not be segmented.
    """
    investments = dict()
    gesellschafter = dict()
    aktionaer = dict()

    # Segment by the two ownership roles.
    terms = [u"Gesellschafter", u"Aktionär"]
    segments = mySoupFuncs.segmentByTerms(soup, terms)
    if segments == "NA":
        return "NA"

    for segment in segments:
        segmentSoup = mySoupFuncs.toSoup(segment[0])
        company = dict()
        if "(Ausland)" in segment[0]:
            # Foreign companies carry no crawlable link.
            company["Name"] = "Failure: Ausland"
        else:
            try:
                anchor = segmentSoup.find("a")
                company["Name"] = anchor.getText().strip()
                company["Link"] = anchor["href"]
                try:
                    # First decimal number in the segment is the share.
                    # NOTE(review): "\\%" keeps the original literal
                    # backslash-percent suffix; a plain "%" was probably
                    # intended -- confirm against downstream consumers.
                    company["Anteil"] = str(
                        re.findall(r"\d+,\d+", segment[0])[0].strip()) + "\\%"
                except IndexError:
                    company["Anteil"] = "NA"

                # Get more detailed information for the company once.
                if company["Link"] not in visitedLinks:
                    visitedLinks.append(company["Link"])
                    page = mySoupFuncs.getSoup(company["Link"])
                    company = getDetailedInfo(page, company, visitedLinks,
                                              depth + 1, depthLimit)

                # Add to the relevant section.
                if segment[1] == u"Gesellschafter":
                    gesellschafter[company["Name"]] = company
                elif segment[1] == u"Aktionär":
                    aktionaer[company["Name"]] = company
                else:
                    print("NEW COMPANY TYPE")
            except Exception:
                # Segment had no usable link/name structure; skip it.
                print("Couldnt retrieve investments. Data with no link.")

    investments["Gesellschafter"] = gesellschafter
    investments["Aktionär"] = aktionaer
    return investments
def getPersonInfo(personDict, visitedCompanies, depth, depthLimit):
    """Crawl a person's detail page and collect their holdings and roles.

    Parameters:
        personDict -- dict with at least "Name" and "Link" keys; this
                      dict is mutated and returned.
        visitedCompanies -- list of already-crawled company URLs
                            (mutated in place).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        the person dict extended with "Investments" (Beteiligungen) and
        "Functions" (Funktionen) sub-dicts keyed by company name.

    Side effects:
        appends to the module-level `nodes` and `edges` graph containers.
    """
    uniqueID = str(random.randrange(100000000))
    # BUG FIX: append takes a single argument -- record the node as a
    # (name, id) tuple, matching how edges are stored below.
    nodes.append((personDict["Name"], uniqueID))

    person = personDict
    person["Investments"] = dict()
    person["Functions"] = dict()

    url = personDict["Link"]
    soup = mySoupFuncs.getSoup(url)
    body = soup.find("div", class_="modal-body")
    sections = mySoupFuncs.segmentByTerms(body, ["<h2 class=\"h4\">"])
    if sections == "NA":
        return personDict

    def collectCompanies(sectionSoup, target):
        """Crawl every blue company link in the section into *target*."""
        for link in sectionSoup.find_all("a", class_="blue"):
            company = dict()
            company["Name"] = link.getText().replace(
                "\r", " ").replace("\n", " ").strip()
            company["Link"] = link["href"]
            if company["Link"] not in visitedCompanies:
                edges.append((personDict["Name"], company["Name"]))
                visitedCompanies.append(company["Link"])
                page = mySoupFuncs.getSoup(company["Link"])
                target[company["Name"]] = Crawler_companies.getDetailedInfo(
                    page, company, visitedCompanies, depth + 1, depthLimit)

    # The Beteiligungen and Funktionen sections are parsed identically;
    # only the destination dict differs.
    for section in sections:
        sectionSoup = mySoupFuncs.toSoup(section[0])
        sectionText = sectionSoup.getText()
        if "Beteiligungen" in sectionText:
            collectCompanies(sectionSoup, person["Investments"])
        elif "Funktionen" in sectionText:
            collectCompanies(sectionSoup, person["Functions"])

    return person
def getCompanySections(pages, terms, where):
    """Collect all company result cards whose segment header matches a place.

    Parameters:
        pages -- iterable of BeautifulSoup result pages.
        terms -- header terms passed to segmentByTerms.
        where -- location substring to filter segments by
                 (case-insensitive).

    Returns:
        list of <li class="card result"> tags from every matching segment.
    """
    relevantSegments = list()
    companySections = list()

    for page in pages:
        segments = mySoupFuncs.segmentByTerms(page, terms)
        # segmentByTerms signals "nothing found" with the string "NA";
        # iterating that string would loop over its characters, so skip.
        if segments == "NA":
            continue
        for segment in segments:
            header = mySoupFuncs.toSoup(
                segment[0]).span.next_sibling.next_sibling.string
            if where.lower() in header.lower():
                relevantSegments.append(segment[0])

    # Gather every company card inside each relevant segment.
    for segment in relevantSegments:
        companySections.extend(
            mySoupFuncs.toSoup(segment).find_all("li", class_="card result"))

    return companySections
def getStakeholders(soup, visitedLinks, depth, depthLimit):
    """Parse the "Anteilseigner" section into person and company stakeholders.

    Parameters:
        soup -- BeautifulSoup of the section to parse.
        visitedLinks -- list of already-crawled URLs (mutated in place).
        depth, depthLimit -- current crawl depth and the maximum depth.

    Returns:
        nested dict:
            {"Persons":   {"Gesellschafter": {...}, u"Aktionär": {...}},
             "Companies": {"Gesellschafter": {...}, u"Aktionär": {...}}}
        keyed by person/company name, or "NA" when the section could not
        be segmented.
    """
    stakeholders = dict()
    stakeholders["Persons"] = dict()
    stakeholders["Companies"] = dict()
    for group in ("Persons", "Companies"):
        stakeholders[group]["Gesellschafter"] = dict()
        stakeholders[group][u"Aktionär"] = dict()

    # Segment by the two ownership roles.
    terms = [u"Gesellschafter", u"Aktionär"]
    segments = mySoupFuncs.segmentByTerms(soup, terms)
    if segments == "NA":
        return "NA"

    def extractName(html, textRepr):
        """Pull a person's name out of one segment.

        Entries with a salutation ("Herrn"/"Frau") end at the first
        escaped carriage return of the repr()'d text; otherwise the name
        sits between the first two <br/> tags.
        """
        for salutation in ("Herrn", "Frau"):
            if salutation in html:
                start = textRepr.index(salutation)
                end = textRepr[start:].index("\\r")
                return textRepr[start:start + end]
        # NOTE(review): offsets are computed on the raw HTML but applied
        # to the repr()'d text; kept as-is to preserve existing behaviour.
        start = html.index("<br/>") + 5
        end = html[start:].index("<br/>")
        return textRepr[start:start + end].strip()

    for segment in segments:
        segmentSoup = mySoupFuncs.toSoup(segment[0])

        if "Einzelperson" not in segment[0]:
            # ---- company stakeholder ----
            company = dict()
            if "(Ausland)" in segment[0]:
                # Foreign companies carry no crawlable link.
                company["Name"] = "Failure: Ausland"
            else:
                try:
                    anchor = segmentSoup.find("a")
                    # BUG FIX: the name was unconditionally overwritten with
                    # "Bad data. Please check manually", which collapsed
                    # every company onto a single dict key. Keep the real
                    # link text, matching getInvestments.
                    company["Name"] = anchor.getText().strip()
                    company["Link"] = anchor["href"]
                    try:
                        # NOTE(review): "\\%" keeps the original literal
                        # backslash-percent suffix; plain "%" was probably
                        # intended -- confirm downstream.
                        company["Anteil"] = str(
                            re.findall(r"\d+,\d+",
                                       segment[0])[0].strip()) + "\\%"
                    except IndexError:
                        company["Anteil"] = "NA"

                    # Get more detailed information for the company once.
                    if company["Link"] not in visitedLinks:
                        visitedLinks.append(company["Link"])
                        page = mySoupFuncs.getSoup(company["Link"])
                        company = getDetailedInfo(page, company, visitedLinks,
                                                  depth + 1, depthLimit)

                    if segment[1] == u"Gesellschafter":
                        stakeholders["Companies"]["Gesellschafter"][
                            company["Name"]] = company
                    elif segment[1] == u"Aktionär":
                        stakeholders["Companies"][u"Aktionär"][
                            company["Name"]] = company
                except Exception:
                    # The original message string was split across source
                    # lines (a syntax error); repaired to one line.
                    print("Couldnt retrieve stakeholder. Data with no link")
        else:
            # ---- individual person stakeholder ----
            person = dict()
            stringSection = repr(segmentSoup.getText())
            person["Name"] = extractName(segment[0], stringSection)

            # Follow the person's detail link once, if not already visited.
            if segmentSoup.find("a"):
                person["Link"] = segmentSoup.find("a")["href"]
                if person["Link"] not in visitedLinks:
                    visitedLinks.append(person["Link"])
                    person = Crawler_person.getPersonInfo(
                        person, visitedLinks, depth, depthLimit)

            # Share percentage is optional for persons.
            try:
                person["Anteil"] = str(
                    re.findall(r"\d+,\d+", segment[0])[0].strip()) + "\\%"
            except IndexError:
                pass

            # Append to the relevant group.
            if segment[1] == u"Gesellschafter":
                stakeholders["Persons"]["Gesellschafter"][
                    person["Name"]] = person
            elif segment[1] == u"Aktionär":
                stakeholders["Persons"][u"Aktionär"][person["Name"]] = person
            else:
                print("NEW PERSON TYPE")

    return stakeholders