예제 #1
0
def _tag_websites(frame, columns, year, path, firm_type, source):
    """Return a tagged copy of ``frame`` restricted to ``columns``.

    Args:
        frame: DataFrame of firms; must already contain every name in
            ``columns`` other than 'year' and 'path', which are set here.
        columns: Ordered list of column names to keep.
        year: Value stored in the 'year' column (``None`` for startups).
        path: Value stored in the 'path' column.
        firm_type: Value stored in the 'type' column.
        source: Value stored in the 'source' column.

    Returns:
        A new DataFrame with ``columns`` followed by 'type' and 'source'.
        The input frame is not modified.
    """
    # Work on explicit copies: ``frame`` is typically a boolean-filtered
    # slice, and assigning columns onto a slice triggers pandas'
    # SettingWithCopyWarning.
    tagged = frame.copy()
    tagged['year'] = year
    tagged['path'] = path
    tagged = tagged[columns].copy()
    tagged['type'] = firm_type
    tagged['source'] = source
    return tagged


def train_website_novelty_scores(restimate_only=True):
    """Train (or reload) one similarity model per year and store it.

    For every year in 2003..2018 a fresh ``similarity_estimator`` is created.
    When ``restimate_only`` is false the estimator is trained on that year's
    Preqin and Crunchbase startups plus all public firms with
    ``ipoyear <= year``; otherwise the model previously stored under
    ``../../tfidf/<year>`` is loaded. In both cases similarities are then
    estimated and the model is written back to ``../../tfidf/<year>``.

    Args:
        restimate_only: If true (the default), skip training and reuse the
            stored model for each year.
    """
    # Local import keeps this function self-contained; DataFrame.append was
    # removed in pandas 2.0, so the frames are combined with pd.concat.
    import pandas as pd

    startup_columns = ['website', 'year', 'path', 'incyear']
    public_columns = ['website', 'year', 'path']

    for year in range(2003, 2019):
        print("\n\n" + "*" * 50)
        print("***  \tStarting year {0} ".format(year))
        print("*" * 50 + "\n\n")

        model_path = "../../tfidf/{0}".format(year)
        estimator = similarity_estimator()

        if not restimate_only:
            # Then, reload from underlying data
            preqin = data_reader.read_preqin()
            pq_startups = _tag_websites(
                preqin[preqin.incyear == year],
                startup_columns, None, "../../out/", "startup", "preqin")

            crunchbase = data_reader.read_crunchbase()
            crunchbase['incyear'] = crunchbase.founding_year
            cb_startups = _tag_websites(
                crunchbase[crunchbase.incyear == year],
                startup_columns, None, "../../out/", "startup", "crunchbase")

            # Public firms carry the cohort year itself and have no
            # 'incyear' column; that column is left missing after concat.
            public = data_reader.read_public_companies()
            public_firms = _tag_websites(
                public[public.ipoyear <= year],
                public_columns, year, "../../out_public/",
                "public_firm", "orbis")

            all_websites = pd.concat([pq_startups, cb_startups, public_firms])

            estimator.load_train(all_websites)
            estimator.prepare_train_documents()
            estimator.train()
        else:
            print("No new estimates, loading old models")
            estimator.load_model(model_path)

        estimator.estimate_similarities()
        estimator.store_model(model_path)
def train_HP_industries(restimate_only=False):
    """Train (or reload) the HP-industries estimator, then store it.

    When ``restimate_only`` is false the estimator is trained on every
    Crunchbase startup (all cohorts, no public firms). Otherwise the model
    stored at ``../../tfidf/hp_industries`` is loaded. Either way industries
    are estimated afterwards and the model is written back to that path.

    Args:
        restimate_only: If true, skip training and reuse the stored model.
    """
    estimator = HP_industries_estimator()
    model_location = "../../tfidf/hp_industries"

    if restimate_only:
        # Reuse the model saved by an earlier run.
        estimator.load_model(model_location)
    else:
        # Loads all startups from all cohorts, and no public firms
        startups = data_reader.read_crunchbase()
        startups['incyear'] = startups.founding_year
        startups['year'] = None
        startups['path'] = "../../out/"

        kept_columns = ['website', 'year', 'path', 'incyear']
        startups = startups[kept_columns]
        startups["type"] = "startup"
        startups['source'] = "crunchbase"

        estimator.load_train(startups)
        estimator.prepare_train_documents()
        estimator.train()

    estimator.estimate_industries()
    estimator.store_model(model_location)
from json.decoder import JSONDecodeError

###############################################
##
##
##   This script queries the Wayback Machine for the closest archived
##    snapshot link of every website in Crunchbase.
##
##
##
##
##
##

# Load the Crunchbase table and add two columns, initialised to empty
# strings, that the loop below fills in per company.
websites = data_reader.read_crunchbase()
websites[['closest_snapshot']] = ""
websites[['closest_snapshot_time']] = ""

for index, company in websites.iterrows():
    crawler = waybackmachine_crawler(company['website'])
    # Target the year after the company's founding year.
    year = company['founding_year'] + 1

    try:
        # Arguments are (year, 1, 1) -- presumably year/month/day, i.e.
        # January 1st of the target year; TODO confirm against the crawler.
        closest_snapshot = crawler.list_closest_snapshot(year, 1, 1)
    except JSONDecodeError:
        # NOTE(review): the error is only printed; `closest_snapshot` is not
        # reset here, so the check below sees the previous row's value (or
        # raises NameError if this happens on the very first row).
        print("\n\n*********JSONDecodeError************")

    if closest_snapshot is not None:
        # Store the snapshot's string form on this company's row.
        websites.at[index, 'closest_snapshot'] = str(closest_snapshot)
        websites.at[index,
import pdb
import os
sys.path.append(os.path.abspath('../download'))
sys.path.append(os.path.abspath('../text_analysis'))
from data_reader import data_reader
from website_text_dataset import website_text_dataset
from langdetect import detect

import argparse


# Build the per-year input frames: one cohort year at a time, 2003 through
# 2018 (the range end is exclusive).
for year in range(2003, 2019):

    print("Reading data for year {0}".format(year))
    
    # Crunchbase startups whose founding year equals the cohort year.
    # NOTE(review): the columns below are assigned onto a boolean-filtered
    # frame, which can raise pandas' SettingWithCopyWarning -- consider
    # filtering with .copy().
    cb_startups = data_reader.read_crunchbase()
    cb_startups['incyear'] = cb_startups.founding_year
    cb_startups = cb_startups[cb_startups.incyear == year]
    cb_startups['year'] = None
    cb_startups['path'] = "../../out/"
    cb_startups = cb_startups[['website','year','path','incyear']]
    cb_startups["type"] = "startup"
    cb_startups['source'] = "crunchbase"

    # Public firms with ipoyear <= the cohort year; unlike the startups they
    # carry the cohort year in 'year' and have no 'incyear' column.
    public_firms = data_reader.read_public_companies()
    public_firms = public_firms[public_firms.ipoyear <= year]
    public_firms['year'] = year
    public_firms['path'] = "../../out_public/"
    public_firms = public_firms[['website','year','path']]
    public_firms['type'] = "public_firm"
    public_firms['source'] = "orbis"