/
parse.py
38 lines (38 loc) · 1.76 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#! /usr/bin/python
import string;
import re;
import urllib;
from indextank.client import ApiClient;
indexapi = ApiClient('http://:7Xwa4xmDZZsyFG@d6nwu.api.indextank.com');
index = indexapi.get_index('techcrunch');
f = open("companies.txt");
fw = open("intermediate.txt", 'w');
i = 0;
for line in f :
p = re.compile(r'{"name":');
list = p.split(line.rstrip('"\n'));
if len(list) > 1:
url = "http://api.crunchbase.com/v/1/company/" + list[1].rstrip('",').lstrip(' "')+".js" ;
print url;
print "Company:" + list[1].rstrip('",').lstrip(' "');
companyid = list[1].rstrip('",').lstrip(' "');
fw.write("Company:" + list[1].rstrip('",').lstrip(' "')+" ");
page = urllib.urlopen(url);
buf = page.read();
if re.search("error",buf) :
fw.write("\n");
continue;
ext = re.compile('("homepage_url"):(.*)');
homepage =ext.search(buf).group(2).strip(' "|",')+"\n";
fw.write(ext.search(buf).group(1).strip('"') + ":" + ext.search(buf).group(2).strip(' "|",')+" ");
jobs = ext.search(buf).group(2).rstrip(',"')+"/jobs";
ext = re.compile('("category_code"):(.*)');
fw.write(ext.search(buf).group(1).strip('"') + ":" + ext.search(buf).group(2).strip(' "|",')+" ");
category = ext.search(buf).group(2).strip(' "|",');
ext = re.compile('("number_of_employees"):(.*)');
fw.write(ext.search(buf).group(1).strip('"') + ":" + ext.search(buf).group(2).strip(' "|",')+"\n");
num_emp = ext.search(buf).group(2).strip(' "|",');
print category;
index.add_document(i, { 'text': category, "company":companyid, "homepage_url":homepage, "category_code":category, "number_of_employees":num_emp});
i = i + 1;
f.close();