Пример #1
0
    def execute_parser(self, rootDir):
        """Import the events of every .docx file found under ``rootDir``.

        Walks ``rootDir`` recursively. In each visited directory every file
        whose name contains the UTF-8 byte sequence for 'e-diaeresis'
        ('\\xc3\\xab') is renamed with a plain 'e' instead, and every file
        ending in '.docx' is run through ``self.get_docx_text``,
        ``self.get_lxml_content`` and ``self.parse_data``; the resulting
        documents are handed to ``mongo_utils.insert``.

        Progress and the final number of imported events are printed to
        stdout. The current working directory is changed while a directory is
        being processed and is always restored afterwards.

        Args:
            rootDir: Path of the directory tree to import (relative or
                absolute).

        Returns:
            None.
        """
        r_cnt = 0
        for dirName, subdirList, fileList in os.walk(rootDir):
            # NOTE(review): os.walk yields paths that start with rootDir, so
            # this only matches when rootDir itself is 'data'. Presumably the
            # intent was to skip directories *named* 'data' -- confirm before
            # changing the comparison.
            if dirName == 'data':
                continue

            # The parsing helpers receive bare file names, so work from inside
            # the directory. The previous cwd is restored before os.walk is
            # resumed, which is required when rootDir is a relative path.
            previous_cwd = os.getcwd()
            os.chdir(dirName)
            try:
                for f in fileList:
                    fn = f.replace('\xc3\xab', 'e')
                    if fn != f:
                        os.rename(f, fn)

                    if not f.endswith('.docx'):
                        continue

                    print('Docx file: %s ' % f)
                    results = self.get_docx_text(fn)
                    document = self.get_lxml_content(fn)
                    data_handler = self.parse_data(results, f, document)
                    r_cnt += data_handler['counter']
                    print("---------------------------------------")
                    print("\n")
                    mongo_utils.insert(data_handler['docs'])
            finally:
                os.chdir(previous_cwd)

            print('Found directory: %s ' % dirName)
        print("%d events imported into database." % r_cnt)
Пример #2
0
def submit_entry():
    """Store one classification entry posted as JSON and acknowledge it.

    Reads the JSON body of the current request, derives the domain of the
    submitted URL with tldextract, builds a document from the body and passes
    it to ``mongo_utils.insert``.

    The body must provide ``url``, ``chrome_user_id``, ``text``, ``date`` and
    ``classification``; ``version`` is copied into the document when present.

    Returns:
        A ``Response`` with status 200.

    Raises:
        KeyError: If one of the required keys is missing from the body.
    """
    req = request.json

    extracted = tldextract.extract(req['url'])
    # Hosts without a public suffix (e.g. "localhost" or a bare IP address)
    # come back with an empty suffix; joining only the non-empty parts avoids
    # storing a domain with a dangling dot such as "localhost.".
    domain = ".".join(
        part for part in (extracted.domain, extracted.suffix) if part
    )

    doc = {
        'url': req['url'],
        'domain': domain,
        'chromeUserId': req['chrome_user_id'],
        'text': req['text'],
        # 'date' is divided by 1000 before conversion, so it is presumably an
        # epoch timestamp in milliseconds -- confirm against the client.
        # fromtimestamp() yields a naive datetime in local time.
        'timestamp': datetime.fromtimestamp(req['date'] / 1e3),
        "classification": req['classification'],
    }

    if "version" in req:
        doc["version"] = req['version']

    mongo_utils.insert(doc)
    return Response(status=200)
Пример #3
0
def submit_entry():
    """Persist a submitted classification entry and reply with HTTP 200.

    The JSON request body supplies ``url``, ``chrome_user_id``, ``text``,
    ``date`` and ``classification`` (all required) plus an optional
    ``version``. The URL's domain is extracted with tldextract and stored next
    to the submitted fields through ``mongo_utils.insert``.

    Returns:
        A ``Response`` with status 200.

    Raises:
        KeyError: If a required key is absent from the request body.
    """
    req = request.json

    extracted = tldextract.extract(req['url'])
    # tldextract reports an empty suffix for hosts such as "localhost" or raw
    # IP addresses. Only non-empty parts are joined so the stored domain never
    # ends (or starts) with a stray dot.
    domain_parts = [extracted.domain, extracted.suffix]
    domain = ".".join(part for part in domain_parts if part)

    doc = {
        'url': req['url'],
        'domain': domain,
        'chromeUserId': req['chrome_user_id'],
        'text': req['text'],
        # The value is scaled by 1/1000 first, which suggests 'date' arrives
        # as epoch milliseconds -- TODO confirm. The resulting datetime is
        # naive and expressed in local time.
        'timestamp': datetime.fromtimestamp(req['date'] / 1e3),
        "classification": req['classification'],
    }

    if "version" in req:
        doc["version"] = req['version']

    mongo_utils.insert(doc)
    return Response(status=200)
Пример #4
0
    def parse(self, csv_path='importer/csv_importer/classified_data.csv'):
        """Insert one document per data row of a classified-events CSV file.

        The first line of the file is treated as a header and skipped. For
        every remaining row the region (column 0), timestamp (column 2),
        description (column 3), institutions JSON (column 4) and category
        (column 18) are read, converted and passed to ``mongo_utils.insert``.
        Each description is also printed to stdout.

        Args:
            csv_path: Location of the CSV file. Defaults to the path that was
                previously hard-coded.

        Returns:
            None.

        Raises:
            ValueError: If a timestamp does not match
                ``%Y-%m-%dT%H:%M:%S.%fZ`` or the institutions column is not
                valid JSON.
            IndexError: If a row has fewer than 19 columns.
        """
        # Binary mode is what the Python 2 csv module expects.
        with open(csv_path, 'rb') as csvfile:
            reader = csv.reader(csvfile)
            # Skip header; the default keeps an empty file from raising.
            next(reader, None)

            for row in reader:
                region = row[0]
                datetime_string = row[2]
                description = row[3].strip()
                institutions = row[4]
                category = row[18].title()

                institutions_json = json.loads(institutions)

                datetime_obj = datetime.strptime(datetime_string, '%Y-%m-%dT%H:%M:%S.%fZ')

                # Derive the date-only and time-only values from the parsed
                # timestamp rather than slicing the string at fixed offsets,
                # which breaks on non-zero-padded months or days.
                # Midnight of the same day:
                date_obj = datetime_obj.replace(hour=0, minute=0, second=0, microsecond=0)
                # Same time of day, pinned to the placeholder date 2000-01-01:
                time_obj = datetime_obj.replace(year=2000, month=1, day=1)

                doc = {
                    "description": description,
                    "geoLocated": False,
                    "region": region,
                    "datetime": datetime_obj,
                    "date": date_obj,
                    "time": time_obj,
                    "notEnoughDetails": False,
                    "institutions": institutions_json,
                    "category": category
                }

                print(description)
                mongo_utils.insert(doc)