def train_website_novelty_scores(restimate_only=True):
    for year in range(2003, 2019):
        print("\n\n" + "*" * 50)
        print("*** \tStarting year {0} ".format(year))
        print("*" * 50 + "\n\n")

        estimator = similarity_estimator()

        if not restimate_only:
            # Then, reload from the underlying data.
            pq_startups = data_reader.read_preqin()
            pq_startups = pq_startups[pq_startups.incyear == year]
            pq_startups['year'] = None
            pq_startups['path'] = "../../out/"
            pq_startups = pq_startups[['website', 'year', 'path', 'incyear']]
            pq_startups["type"] = "startup"
            pq_startups["source"] = "preqin"

            cb_startups = data_reader.read_crunchbase()
            cb_startups['incyear'] = cb_startups.founding_year
            cb_startups = cb_startups[cb_startups.incyear == year]
            cb_startups['year'] = None
            cb_startups['path'] = "../../out/"
            cb_startups = cb_startups[['website', 'year', 'path', 'incyear']]
            cb_startups["type"] = "startup"
            cb_startups['source'] = "crunchbase"

            public_firms = data_reader.read_public_companies()
            public_firms = public_firms[public_firms.ipoyear <= year]
            public_firms['year'] = year
            public_firms['path'] = "../../out_public/"
            public_firms = public_firms[['website', 'year', 'path']]
            public_firms['type'] = "public_firm"
            public_firms['source'] = "orbis"

            all_websites = pq_startups.append(cb_startups).append(public_firms)

            estimator.load_train(all_websites)
            estimator.prepare_train_documents()
            estimator.train()
        else:
            print("No new estimates, loading old models")
            estimator.load_model("../../tfidf/{0}".format(year))

        estimator.estimate_similarities()
        estimator.store_model("../../tfidf/{0}".format(year))
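# Illustrative sketch only: the repo's similarity_estimator class is defined
# elsewhere. This shows the kind of TF-IDF cosine-similarity computation it
# presumably wraps -- comparing each startup's website text against incumbent
# public firms' website text. The function name and parameters below are
# hypothetical, not the repo's actual API.
def tfidf_similarity_sketch(startup_texts, public_firm_texts):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    vectorizer = TfidfVectorizer(stop_words="english")
    # Fit one vocabulary over all documents, then split back into the two groups.
    matrix = vectorizer.fit_transform(startup_texts + public_firm_texts)
    startups = matrix[:len(startup_texts)]
    incumbents = matrix[len(startup_texts):]
    # Rows: startups; columns: public firms. Low maximum similarity to any
    # incumbent would indicate a more "novel" website.
    return cosine_similarity(startups, incumbents)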
def train_HP_industries(restimate_only=False):
    estimator = HP_industries_estimator()

    if not restimate_only:
        # Load all startups from all cohorts, and no public firms.
        cb_startups = data_reader.read_crunchbase()
        cb_startups['incyear'] = cb_startups.founding_year
        cb_startups['year'] = None
        cb_startups['path'] = "../../out/"
        cb_startups = cb_startups[['website', 'year', 'path', 'incyear']]
        cb_startups["type"] = "startup"
        cb_startups['source'] = "crunchbase"

        estimator.load_train(cb_startups)
        estimator.prepare_train_documents()
        estimator.train()
    else:
        estimator.load_model("../../tfidf/hp_industries")

    estimator.estimate_industries()
    estimator.store_model("../../tfidf/hp_industries")
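# Usage sketch (not in the original file): both training entry points could be
# driven from the command line. The flag names below are hypothetical.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--restimate-only", action="store_true",
                        help="load stored TF-IDF models instead of retraining")
    parser.add_argument("--hp-industries", action="store_true",
                        help="run the HP industries estimator instead of novelty scores")
    args = parser.parse_args()

    if args.hp_industries:
        train_HP_industries(restimate_only=args.restimate_only)
    else:
        train_website_novelty_scores(restimate_only=args.restimate_only)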
from json.decoder import JSONDecodeError
from data_reader import data_reader
# Assumed import path, mirroring the data_reader module layout.
from waybackmachine_crawler import waybackmachine_crawler

###############################################
##
##  This script goes to the Wayback Machine to get the closest
##  snapshot link for all websites in Crunchbase.
##
###############################################

websites = data_reader.read_crunchbase()
websites['closest_snapshot'] = ""
websites['closest_snapshot_time'] = ""

for index, company in websites.iterrows():
    crawler = waybackmachine_crawler(company['website'])
    year = company['founding_year'] + 1

    # Initialize to None so a JSONDecodeError does not leave a stale value
    # from the previous iteration (or an unbound name on the first one).
    closest_snapshot = None
    try:
        closest_snapshot = crawler.list_closest_snapshot(year, 1, 1)
    except JSONDecodeError:
        print("\n\n*********JSONDecodeError************")

    if closest_snapshot is not None:
        websites.at[index, 'closest_snapshot'] = str(closest_snapshot)
        # NOTE: the original source is truncated here; assuming the snapshot
        # object carries its capture timestamp (hypothetical attribute).
        websites.at[index, 'closest_snapshot_time'] = str(getattr(closest_snapshot, 'timestamp', ''))
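# The original script ends after the loop; a plausible (hypothetical) final
# step is to persist the augmented table so downstream scripts can read it.
# The output filename is an assumption, not from the original source.
websites.to_csv("../../out/crunchbase_closest_snapshots.csv", index=False)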
import pdb
import os
import sys  # needed for the sys.path.append calls below; missing in the original

sys.path.append(os.path.abspath('../download'))
sys.path.append(os.path.abspath('../text_analysis'))

from data_reader import data_reader
from website_text_dataset import website_text_dataset
from langdetect import detect
import argparse

for year in range(2003, 2019):
    print("Reading data for year {0}".format(year))

    cb_startups = data_reader.read_crunchbase()
    cb_startups['incyear'] = cb_startups.founding_year
    cb_startups = cb_startups[cb_startups.incyear == year]
    cb_startups['year'] = None
    cb_startups['path'] = "../../out/"
    cb_startups = cb_startups[['website', 'year', 'path', 'incyear']]
    cb_startups["type"] = "startup"
    cb_startups['source'] = "crunchbase"

    public_firms = data_reader.read_public_companies()
    public_firms = public_firms[public_firms.ipoyear <= year]
    public_firms['year'] = year
    public_firms['path'] = "../../out_public/"
    public_firms = public_firms[['website', 'year', 'path']]
    public_firms['type'] = "public_firm"
    public_firms['source'] = "orbis"
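    # The excerpt ends here; `detect` and `website_text_dataset` are imported
    # above but unused in the shown fragment. A plausible continuation (sketch
    # only; names beyond the imports are hypothetical) would load each cohort's
    # website text and keep English pages:
    #
    #   all_websites = cb_startups.append(public_firms)
    #   texts = website_text_dataset.load(all_websites)   # hypothetical API
    #   english = [t for t in texts if detect(t) == 'en']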