def write_to_file(file_name, companies, summary_columns, wb_sheet):
    """Generate a new Excel file named ``file_name`` containing the data
    found in PreSeries for each entry of the original Excel file.

    :param file_name: the name of the file to be generated
    :param companies: a list of dicts with the basic data found in
        PreSeries for each company (must include "row"; may include
        "name", "domain", "country_code")
    :param summary_columns: the columns of the original file that are
        copied into the new file to give more information about the
        companies (may be None or empty)
    :param wb_sheet: the Excel sheet of the original file where the
        summary fields of the companies are read from
    """
    workbook = Workbook()
    companies_sheet = workbook.add_sheet('Companies')

    # Build the header row: fixed columns first, then the optional
    # summary columns copied verbatim from the original file.
    header = ["Original Row", "Company Name", "Domain", "Country"]
    if summary_columns:
        header.extend(summary_columns)
    for index, value in enumerate(header):
        companies_sheet.write(0, index, value)

    for index, company_data in enumerate(companies):
        companies_sheet.write(1 + index, 0, company_data["row"])
        companies_sheet.write(
            1 + index, 1,
            company_data["name"].decode('utf-8', 'ignore')
            if "name" in company_data else "")
        # BUGFIX: the original wrote country_code under the "Domain"
        # header and domain under the "Country" header. Values are now
        # written under their matching header columns.
        companies_sheet.write(
            1 + index, 2,
            company_data["domain"] if "domain" in company_data else "")
        companies_sheet.write(
            1 + index, 3,
            company_data["country_code"]
            if "country_code" in company_data else "")

        # Guard against summary_columns being None (the header loop
        # above already tolerated it, this loop did not).
        for index2, summary_column in enumerate(summary_columns or []):
            cell = wb_sheet.cell_value(
                company_data["row"],
                PreSeriesUtils.excel2num(summary_column))
            # Legacy files are usually cp1252; fall back to UTF-8 for
            # characters that codepage cannot represent.
            try:
                columnvalue = cell.encode('cp1252')
            except UnicodeEncodeError:
                columnvalue = cell.encode('utf-8')
            companies_sheet.write(
                1 + index, 4 + index2,
                columnvalue.decode('utf-8', 'ignore'))

    workbook.save(file_name)
def dump_similar_objects(similar_by_company):
    """Generate a CSV-like dump of the similar companies of each company."""
    # Human-readable column titles, encoded to utf8 bytes as the dump
    # helper expects.
    column_titles = (
        'Company ID', 'Company Name', 'Similar Company Id',
        'Similar Company Name', 'Similar Company Score',
        'Distance Btw Companies', 'Max Distance in Cluster', 'Similarity')
    headers = [title.encode('utf8') for title in column_titles]

    # Attribute names of the exported fields, in header order.
    fields = [
        "company_id", "company_name", "similar_company_id",
        "similar_company_name", "similar_company_score",
        "distance", "max_distance", "similarity"]

    # Flatten the per-company lists of similars into one record list.
    resources = [record
                 for similars in similar_by_company.values()
                 for record in similars]

    return PreSeriesUtils.dump_opbjects(headers, fields, resources)
def dump_rounds_objects(founders):
    """Generate a CSV-like dump (rows with columns) of the funding rounds."""
    # Human-readable column titles, encoded to utf8 bytes as the dump
    # helper expects.
    column_titles = (
        'Company ID', 'Company Name', 'Stage Name', 'Date',
        'Funding Type', 'Series', 'Amount')
    headers = [title.encode('utf8') for title in column_titles]

    # Attribute names of the exported fields, in header order.
    fields = [
        "company_id", "company_name", "stage", "date",
        "funding_type", "series", "amount"]

    return PreSeriesUtils.dump_opbjects(headers, fields, founders)
def dump_stages_objects(founders):
    """Generate a CSV-like dump (rows with columns) of the company stages."""
    # Human-readable column titles, encoded to utf8 bytes as the dump
    # helper expects.
    column_titles = (
        'Company ID', 'Company Name', 'Stage Name', 'Start Date',
        'End Date', 'First Round Date', 'Last Round Date',
        'Total Investment', 'Total Rounds')
    headers = [title.encode('utf8') for title in column_titles]

    # Attribute names of the exported fields, in header order.
    fields = [
        "company_id", "company_name", "stage", "start_date", "end_date",
        "first_round_date", "last_round_date", "investment_amount",
        "total_rounds"]

    return PreSeriesUtils.dump_opbjects(headers, fields, founders)
def dump_person_objects(founders):
    """Generate a CSV-like dump (rows with columns) of the company persons."""
    # Human-readable column titles, encoded to utf8 bytes as the dump
    # helper expects.
    column_titles = (
        'Company ID', 'Company Name', 'PreSeries ID', 'Firstname',
        'Lastname', 'Crunchbase URL', 'Crunchbase Id', 'LinkedIn URL',
        'Facebook URL', 'Twitter URL', 'Google+ URL', 'Gender',
        'Birthdate', 'Updated on')
    headers = [title.encode('utf8') for title in column_titles]

    # Attribute names of the exported fields, in header order.
    fields = [
        "company_id", "company_name", "person_id", "first_name",
        "last_name", "crunchbase_url", "crunchbase_uuid",
        "linkedin_url", "facebook_url", "twitter_url",
        "google_plus_url", "gender", "born", "updated"]

    return PreSeriesUtils.dump_opbjects(headers, fields, founders)
def read_search_data_from_excel(self, file_name, column_id=None,
                                column_name=None, column_country=None,
                                column_domain=None, skip_rows=0):
    """Extract from an Excel file all the companies to look up in PreSeries.

    Builds, for every data row, the query parameters used to look for
    the company in PreSeries. The query string uses only the id
    criterion, or the company name when no id column is given. Domain
    and country_code are NOT part of the query; they are kept to select
    the best match among the candidates returned by the search.

    :param file_name: path of the Excel workbook to read
    :param column_id: Excel column letter holding the PreSeries id
    :param column_name: Excel column letter holding the company name
    :param column_country: Excel column letter holding the country
    :param column_domain: Excel column letter holding the domain
    :param skip_rows: number of leading (header) rows to skip. The
        historical default ``False`` behaves identically (``False == 0``).
    :return: nothing; fills ``self.companies_query`` with one tuple per
        company: (query string, dict of the parameters used in it)
    """
    logging.debug("Looking for the first sheet in the Excel.")
    wb = open_workbook(file_name)
    first_sheet = wb.sheets()[0]
    logging.debug("Sheet name [%s]." % first_sheet.name)

    self.companies_query = []
    for row in range(skip_rows, first_sheet.nrows):
        logging.debug("Processing row: %d" % row)

        if column_id:
            # With an explicit id we can query by id alone.
            company_id = first_sheet.cell_value(
                row, PreSeriesUtils.excel2num(column_id))
            self.companies_query.append(
                ("id=%s" % company_id, {"row": row, "id": company_id}))
            continue

        query_string = {}
        query_params = {"row": row}

        if column_name:
            # Read the cell once (the original read it up to 3 times).
            name_value = first_sheet.cell_value(
                row, PreSeriesUtils.excel2num(column_name))
            if name_value:
                # Legacy files are usually cp1252; fall back to UTF-8
                # for characters that codepage cannot represent.
                try:
                    company_name = name_value.encode('cp1252')
                except UnicodeEncodeError:
                    company_name = name_value.encode('utf-8')
                query_string['name__icontains'] = company_name
                query_params["name"] = company_name

        if column_domain:
            company_domain = PreSeriesUtils.resolve_domain(
                first_sheet.cell_value(
                    row, PreSeriesUtils.excel2num(column_domain)))
            if company_domain:
                # Only used after the search to pick the best candidate.
                query_params["domain"] = company_domain

        if column_country:
            country_value = first_sheet.cell_value(
                row, PreSeriesUtils.excel2num(column_country))
            if country_value:
                country_code = PreSeriesUtils.resolve_country(country_value)
                if country_code:
                    # Only used after the search to pick the best
                    # candidate.
                    query_params['country_code'] = country_code

        self.companies_query.append(
            (urllib.urlencode(query_string), query_params))
def search_companies(self):
    """Search in PreSeries every company listed in ``self.companies_query``.

    ``self.companies_query`` holds one tuple per company:

      - the "query string" for the REST search call
      - the "company details": a dict with the known field-values of the
        company we are looking for (always contains "row"; may contain
        "name", "country_code", "domain", ...)

    The query string deliberately uses only part of the known
    properties (typically just the name) so the search stays flexible;
    the remaining properties are used afterwards to select the most
    likely candidate among the matches.

    :return: tuple ``(found_companies, unknown_companies)`` — the
        companies found (with a "row" key added) and the detail dicts
        of the ones with no match at all
    """
    found_companies = []
    unknown_companies = []

    for query_string, company_details in self.companies_query:
        # Cap the download at 100 candidate companies per search.
        query = "limit=100&%s" % query_string
        logging.debug("Query: %s" % query)

        resp = self.api.search_companies(query_string=query)
        total = resp['meta']['total_count']

        if total == 0:
            # `logging.warning` replaces the deprecated `logging.warn`.
            logging.warning("Unknown company: %s" % company_details)
            unknown_companies.append(company_details)
            continue

        if total > 1:
            # Several matches: pick the most likely candidate using the
            # extra details (domain, country_code, ...).
            best_candidate = PreSeriesUtils.select_best_company(
                company_details, resp['objects'])
            logging.warning("More than one match!\n"
                            "Params: %s \n"
                            "Selected candidate: %s" %
                            (company_details, best_candidate))
        else:
            best_candidate = resp["objects"][0]

        # Single bookkeeping path for both the >1 and ==1 branches.
        company_data = {"row": company_details["row"]}
        company_data.update(best_candidate)
        found_companies.append(
            PreSeriesUtils.encoding_conversion(company_data))

    return found_companies, unknown_companies
def dump_company_objects(companies_details):
    """Generate a CSV-like dump (rows with columns) of the Company objects."""
    # Human-readable column titles, encoded to utf8 bytes as the dump
    # helper expects.
    column_titles = (
        'PreSeries ID', 'Name', 'Elevator Pitch', 'Foundation date',
        'Domain', 'Status', 'Country', 'City', 'Stage', 'Areas',
        'Top Area', 'Headcount', 'Num of Founders', 'Locations',
        'Diversification', 'Funding rounds', 'Total Funding',
        'First funding on', 'Days to first funding', 'Last funding on',
        'Days since last funding', 'Num of MBAs', 'Num of PhDs',
        'Num of patents first year', 'Num of patents last year',
        'Twitter bio', 'Twitter followers', 'Twitter following',
        'Twitter tweets', 'Twitter url', 'Crunchbase url',
        'LinkedIn url', 'Facebook url', 'Google Plus url', 'IPO %',
        'Acquired %', 'Defunct %', 'Ratio - Influencer',
        'Ratio - Traction', 'Country Rank', 'Country Rank Change',
        'Country Rank Percentile', 'Country Rank Percentile Change',
        'Area Rank', 'Area Rank Change', 'Area Rank Percentile',
        'Area Rank Percentile Change', 'World Rank', 'World Rank Change',
        'World Rank Percentile', 'World Rank Percentile Change',
        'Score', 'Score Change', 'Tracked from', 'Updated on')
    headers = [title.encode('utf8') for title in column_titles]

    # Attribute names of the exported fields, in header order.
    fields = [
        "company_id", "name", "company/elevator_pitch",
        "foundation_date", "domain", "status", "country_code", "city",
        "stage", "areas", "top_area", "headcount", "num_of_cofounders",
        "locations_list", "diversity_list", "funding_count",
        "funding_sum", "first_funding_on", "days_to_first_funding",
        "last_funding_on", "days_since_last_funding", "num_of_mbas",
        "num_of_phds", "num_patents_1st_year", "num_patents_on_exit_0",
        "twitter_bio", "twitter_followers", "twitter_following",
        "twitter_tweets", "twitter_url", "company/crunchbase_url",
        "company/linkedin_url", "company/facebook_url",
        "company/googleplus_url", "transition_ipo",
        "transition_acquired", "transition_defunct", "ratio_influencer",
        "ratio_traction", "country_rank", "country_rank_change",
        "country_rank_percentile", "country_rank_percentile_change",
        "area_rank", "area_rank_change", "area_rank_percentile",
        "area_rank_percentile_change", "world_rank",
        "world_rank_change", "world_rank_percentile",
        "world_rank_percentile_change", "score", "score_change",
        "tracked_from", "updated_on"]

    return PreSeriesUtils.dump_opbjects(headers, fields, companies_details)