def execute_parser(self, rootDir):
    """Walk ``rootDir`` and import the events of every docx file found.

    Every directory yielded by ``os.walk(rootDir)`` is processed exactly
    once, except a directory whose walked name is exactly ``'data'``, whose
    files are skipped. For each file in a processed directory:

    * the two-character sequence written below as an escaped literal is
      replaced by ``'e'`` in the file name and the file is renamed on disk
      when that changes the name;
    * if the original name contains ``'docx'`` the file is read with
      ``self.get_docx_text`` and ``self.get_lxml_content``, handed to
      ``self.parse_data``, and the resulting ``data_handler['docs']`` is
      passed to ``mongo_utils.insert``.

    The helpers receive bare file names, so the working directory is
    switched to the directory being processed. It is always switched back
    to the directory that was current when this method was called, even if
    a helper raises.

    Args:
        rootDir: Directory to walk. A relative path is resolved against the
            working directory in effect when the method is called.

    Returns:
        None. Progress lines and the sum of all ``data_handler['counter']``
        values are printed to stdout.
    """
    imported = 0
    start_dir = os.getcwd()

    # os.walk already descends into sub-directories, so a single walk that
    # uses the per-directory file list visits each file exactly once.
    for dirName, _subdirs, fileList in os.walk(rootDir):
        if dirName != 'data':
            os.chdir(os.path.abspath(dirName))
            try:
                for f in fileList:
                    # '\xc3\xab' is the UTF-8 byte pair of "e with
                    # diaeresis" in a Python 2 byte string.
                    fn = f.replace('\xc3\xab', 'e')
                    if fn != f:
                        os.rename(f, fn)

                    if 'docx' in f:
                        print('Docx file: %s ' % f)
                        results = self.get_docx_text(fn)
                        document = self.get_lxml_content(fn)
                        # parse_data is given the original (pre-rename) name.
                        data_handler = self.parse_data(results, f, document)
                        imported += data_handler['counter']
                        print("---------------------------------------")
                        print("\n")
                        mongo_utils.insert(data_handler['docs'])
            finally:
                # Restore the starting directory instead of hopping up a
                # fixed number of levels: relative paths produced by the
                # ongoing os.walk stay valid at any directory depth.
                os.chdir(start_dir)

        print('Found directory: %s ' % dirName)

    print("%d events imported into database." % imported)
def submit_entry():
    """Persist one entry posted as JSON and answer with an empty HTTP 200.

    The JSON body of the current request must provide ``url``,
    ``chrome_user_id``, ``text``, ``date`` and ``classification``;
    ``version`` is copied into the stored document only when present.
    The document is handed to ``mongo_utils.insert``.
    """
    payload = request.json

    # Join the domain and suffix parts reported by tldextract for the URL.
    parts = tldextract.extract(payload['url'])
    domain = "{}.{}".format(parts.domain, parts.suffix)

    entry = {}
    entry['url'] = payload['url']
    entry['domain'] = domain
    entry['chromeUserId'] = payload['chrome_user_id']
    entry['text'] = payload['text']
    # 'date' is divided by 1e3 before conversion - presumably epoch
    # milliseconds sent by the client; TODO confirm.
    entry['timestamp'] = datetime.fromtimestamp(payload['date'] / 1e3)
    entry["classification"] = payload['classification']

    if "version" in payload:
        entry["version"] = payload['version']

    mongo_utils.insert(entry)
    return Response(status=200)
def parse(self):
    """Load ``classified_data.csv`` and insert one document per data row.

    The first CSV row is treated as a header and skipped. For every other
    row, columns 0, 2, 3, 4 and 18 supply the region, timestamp string,
    description, institutions (JSON text) and category. The timestamp is
    stored three ways: as parsed, with the time part zeroed, and with the
    date part replaced by 2000-01-01. Each description is printed and each
    document is passed to ``mongo_utils.insert``.
    """
    stamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'

    with open('importer/csv_importer/classified_data.csv', 'rb') as source:
        rows = csv.reader(source)

        # Discard the header row.
        next(rows)

        for record in rows:
            region = record[0]
            raw_stamp = record[2]
            description = record[3].strip()
            raw_institutions = record[4]
            category = record[18].title()

            institutions = json.loads(raw_institutions)

            # Full timestamp, then date-only and time-only variants built
            # by splicing the raw string at fixed character offsets.
            moment = datetime.strptime(raw_stamp, stamp_format)
            day_only = datetime.strptime(
                raw_stamp[0:10] + 'T00:00:00.000Z', stamp_format)
            clock_only = datetime.strptime(
                '2000-01-01T' + raw_stamp[11:], stamp_format)

            entry = {
                "description": description,
                "geoLocated": False,
                "region": region,
                "datetime": moment,
                "date": day_only,
                "time": clock_only,
                "notEnoughDetails": False,
                "institutions": institutions,
                "category": category
            }

            print(description)
            mongo_utils.insert(entry)