Example #1
def date(xentry, name, parsed):
    """ insert a date-formated element into the entry """
    if not parsed: return
    formatted = time.strftime("%Y-%m-%dT%H:%M:%SZ", parsed)
    xdate = createTextElement(xentry, name, formatted)
    formatted = time.strftime(config.date_format(), parsed)
    xdate.setAttribute('planet:format', formatted.decode('utf-8'))
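
The date() helper above depends on a createTextElement(parent, name, text) utility and a config.date_format() setting from the surrounding module, neither of which is shown in this listing. A minimal, hypothetical harness that exercises it might look like the following; the createTextElement stub and the config stand-in below are assumptions, not the original module's code.

# Hypothetical harness for the date() helper above; createTextElement and
# config are stand-ins for what the surrounding module actually provides.
import time
from xml.dom.minidom import getDOMImplementation

def createTextElement(parent, name, text):
    """Append <name>text</name> under parent and return the new element."""
    node = parent.ownerDocument.createElement(name)
    node.appendChild(parent.ownerDocument.createTextNode(text))
    parent.appendChild(node)
    return node

class config:
    @staticmethod
    def date_format():
        return "%d %B %Y"

doc = getDOMImplementation().createDocument(None, 'entry', None)
date(doc.documentElement, 'updated', time.gmtime())
print doc.documentElement.toxml()
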
Example #2
def bridger_crawling(para, full_name, operator, case_name):
    """
    :param para: Company name
    :param full_name: Full name of current user
    :return: Evidence if found, otherwise NOT_FOUND
    """
    logger.info("BridgerInsight Helper: " + str(para) + " " + str(full_name))
    keyword = para
    # MARK
    para = para.replace('/', '_')

    username, password = config.get_bridger_account(operator)
    
    evidence = dict()
    url = bridger_url(username, password, para)
    if url!="unicode detected!!!!!!":

        crawler = None
        try:
            req = urllib2.Request(url)
            res = urllib2.urlopen(req)
            crawler = res.read()
        except urllib2.URLError as e:
            logger.error("Error while crawling: " + str(e))
            if hasattr(e, "code"):
                logger.error("Code: " + str(e.code))
            if hasattr(e, "reason"):
                logger.error("Reason: " + str(e.reason))
            return NOT_FOUND

        if crawler == NOT_FOUND:
            logger.warning(NOT_FOUND)
            return NOT_FOUND

        # logger.info("BridgerInsight Result: " + str(crawler))
        config.mkdirs(config.local_bridge_folder + config.date_format(config.get_today()))
        name_orig = para.replace('/','_')
        para = (case_name + '_' + para).replace('/', '_')
        item = json.loads(crawler)
        if item['Records'] is None or item["Records"]["ResultRecord"][0]["Watchlist"]["Matches"] is None:
            bridger_no_result(operator, name_orig, bridger_pdf_local(para))
            evidence["type"] = "bridgerInsight"
            evidence["url"] = config.current_url + bridger_pdf_web(para)
            evidence["missing"] = False
            evidence["name"] = keyword
            logger.info("Evidence: " + str(evidence))
            return evidence
        
        item['full name'] = operator
        crawler = json.dumps(item)
       
        name = para.replace(" ", "_")
        with open(name + "_bridge.json", 'w') as fileObject:
            fileObject.write(str(crawler))
        exists = bridger_insight_parser(name + "_bridge.json", name + "_bridge_temp.json")
        if not exists:
            return NOT_FOUND

        print "pdf target======"
        print bridger_pdf_local(name)

        try:
            buildPdf(bridger_pdf_local(name), name + "_bridge_temp.json")
            # buildPdf(bridger_pdf_target(name, case_name), name + "_bridge_temp.json")
            if os.path.isfile(bridger_pdf_local(name)):
                logger.debug("Local BridgerInsight PDF exists; copying it to the case folder")
                shutil.copy(bridger_pdf_local(name), bridger_pdf_target(name, case_name))
        except Exception:
            logger.error("Failed to generate PDF for:" + str(name))
            logger.error(traceback.format_exc())
            return NOT_FOUND

        abstract_data = []
        hit_flag = False
        with open(name + "_bridge.json") as data_file:
            ab_data = json.load(data_file)
            for temp_item in ab_data['Records']['ResultRecord']:
                wl = temp_item['Watchlist']['Matches']['WLMatch']
                for index, wl_item in enumerate(wl):
                    if wl_item['EntityDetails'] is None and wl_item["CountryDetails"] is not None:
                        entry = {}
                        entry['EntityScore'] = wl_item['BestCountryScore']
                        if int(entry['EntityScore']) == 100:
                            hit_flag = True
                        entry['FileName'] = wl_item['File']['Name'][:-4]
                        entry['Entity Name'] = wl_item['EntityName']
                        entry["index"] = index + 1
                        abstract_data.append(entry)
                    elif wl_item["EntityDetails"] is not None and wl_item["CountryDetails"] is None:
                        entry = {}
                        entry['EntityScore'] = wl_item['EntityScore']
                        if int(entry['EntityScore']) == 100:
                            hit_flag = True
                        entry['FileName'] = wl_item['File']['Name'][:-4]
                        entry['Entity Name'] = wl_item['EntityName']
                        entry["index"] = index + 1
                        abstract_data.append(entry)


        target_json_folder = config.target_folder + case_name + "/bridgerJson"
        # 1 Create a folder which will contain JSON files for every user.
        if not os.path.exists(target_json_folder):
            os.makedirs(target_json_folder)

        # 2 Check whether there is such JSON file that matches bocID.json
        if os.path.isfile(bridger_temp_json_target(name, case_name)) and os.access(bridger_temp_json_target(name, case_name),
                                                                          os.R_OK):
            print "File exists and is readable"
        else:
            print "Either file is missing or is not readable"
            os.mknod(bridger_temp_json_target(name, case_name))
            print "Will check whether json exists"
            print os.path.isfile(bridger_temp_json_target(name, case_name)) and os.access(bridger_temp_json_target(name, case_name), os.R_OK)

        # 3 Write userInfo into that JSON

        with open(bridger_temp_json_target(name, case_name), "w") as fileObject_1:
            json.dump(abstract_data, fileObject_1, indent=4, ensure_ascii=False)


        if os.path.isfile(name + "_bridge_temp.json"):
            logger.debug("Copying parsed Bridger data to the case folder")
            shutil.copy((name + "_bridge_temp.json"), bridger_json_target(name, case_name))

        # os.system("rm " + name + "_bridge.json " + name + "_bridge_temp.json")
        os.remove(name + "_bridge.json")
        os.remove(name + "_bridge_temp.json")
        evidence["type"] = "bridgerInsight"
        evidence["url"] = config.current_url + bridger_pdf_web(name)
        evidence["missing"] = False
        evidence["name"] = keyword
        evidence["isHit"] = hit_flag
        logger.info("Evidence: " + str(evidence))
        return evidence
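
bridger_crawling() walks a deeply nested JSON document returned by the Bridger service. Judging only from the key lookups in the code above, the response roughly has the shape sketched below; the values are invented placeholders, only the key names come from the code.

# Rough shape of the BridgerInsight response as implied by the lookups in
# bridger_crawling(); the values are placeholders, not real watchlist data.
sample_response = {
    "Records": {
        "ResultRecord": [
            {
                "Watchlist": {
                    "Matches": {
                        "WLMatch": [
                            {
                                "EntityDetails": {"...": "present for entity matches"},
                                "CountryDetails": None,
                                "EntityScore": 100,        # a score of 100 sets evidence["isHit"]
                                "BestCountryScore": None,  # used instead when only CountryDetails is present
                                "EntityName": "PLACEHOLDER ENTITY",
                                "File": {"Name": "SomeWatchlist.txt"},
                            }
                        ]
                    }
                }
            }
        ]
    }
}
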
Example #3
def dowjones_crawling(item, operator, case_name):
    """
    :param item: Company name
    :return: List of PDF URL
    """
    # item = urllib.quote(item.encode('utf8'), ':/')
    logger.info("Dowjones item: " + str(item))

    username, password = config.get_dowjones_account(operator)
    # Mark
    item = item.replace('/', '_')
    craw_str = dowjones_url(username, password, item)
    if craw_str != "unicode detected!!!!!!":
        logger.debug(craw_str)

        name_item = config.filename_format(item)
    ################
        # stored_file = config.local_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + ".pdf"
        # if os.path.exists(stored_file):
            # logger.debug("Downloaded PDF File Already Exists!")
            # entry_file = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + ".pdf"
            # return [entry_file]
    ##########
        stored_file = config.local_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + "_Dowjones_summary.pdf"
        result = []
    #    if os.path.exists(stored_file):
    #        logger.debug("Downloaded PDF File Already Exists!")
    #        entry_file = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + "_Dowjones_summary.pdf"
    #
    #        pdf_local_path = config.local_dowjones_folder + config.date_format(config.get_today()) + "/"
    #
    #        entry_path = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" 
    #
    #        result.append(entry_file)
    #        
    #        pdf_file = os.listdir(pdf_local_path)
    #        for file in pdf_file:
    #            if name_item + "_companyDetail_" in file and file.startswith(name_item):
    #                result.append(entry_path+file)
    #        return result


        crawler = None
        try:
            req = urllib2.Request(craw_str)
            res = urllib2.urlopen(req)
            crawler = res.read()
        except urllib2.URLError as e:
            logger.error("Error while crawling: " + str(e))
            if hasattr(e, "code"):
                logger.error("Code: " + str(e.code))
            if hasattr(e, "reason"):
                logger.error("Reason: " + str(e.reason))
            return NOT_FOUND

        if not crawler or crawler == NOT_FOUND:
            logger.warning(NOT_FOUND)
            return NOT_FOUND
        print "DowJones Result", crawler
        
        crawler = json.loads(crawler)
        if not crawler['urls'] or str(crawler['urls'][0]) == '':
            logger.warning(NOT_FOUND)
            return NOT_FOUND  
        result = []
        # if "found" in crawler and crawler["found"]:
        logger.debug("Dow Jones Crawling Result: " + str(crawler))
        for craw_item in crawler['urls']:
            name = craw_item['link']
            name = name.split('/')
            name = name[len(name) - 1]
            crawler_path = config.crawler_url + config.dowjones_pdf_folder + name
            local_dir = config.local_dowjones_folder + config.date_format(config.get_today())
            config.mkdirs(local_dir)
            # local = local_dir + "/" + name_item + ".pdf"
            local = local_dir + "/" + name
            target_folder = config.target_folder + case_name + "/originalEvidence"
            print "target_folder========"
            print target_folder
            if not os.path.exists(target_folder):
                print "target_folder doesnt exist!!"
                os.makedirs(target_folder)
            target = target_folder + "/" + name

            logger.info("remote path: " + crawler_path)
            logger.info("local path: " + local)

            crawler_path = urllib.quote(crawler_path.encode('utf8'), ':/')
            urllib.urlretrieve(crawler_path, local)

            # urllib.urlretrieve(crawler_path, target)
            if os.path.isfile(local):
                logger.debug("Local Dow Jones PDF exists; copying it to the case folder")
                shutil.copy(local, target)

            ret_path = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" + name
            # ret_path = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + ".pdf"
            logger.info("Return Path" + ret_path)
            temp = ret_path
            result.append({'origin': craw_item['origin'], 'link': temp})
        # if not crawler["found"]:
        #             break
        # else:
        #     logger.warning(NOT_FOUND)
        #     return NOT_FOUND

        logger.info("Dowjones Result: " + str(result))
        return result
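
dowjones_crawling() expects the crawler endpoint to return JSON whose 'urls' entries each carry an 'origin' label and a 'link' to a PDF generated on the crawler host. A purely illustrative payload follows; the key names come from the code above, all values are made up.

# Placeholder illustration of the JSON that dowjones_crawling() parses;
# only the 'urls', 'origin' and 'link' keys are taken from the code above.
sample_payload = {
    "urls": [
        {"origin": "summary",
         "link": "http://crawler.example/dowjones_pdf/ACME_Corp_Dowjones_summary.pdf"},
        {"origin": "companyDetail",
         "link": "http://crawler.example/dowjones_pdf/ACME_Corp_companyDetail_1.pdf"},
    ]
}
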
Example #4
def bridger_pdf_web(para):
    return config.web_bridge_folder + config.date_format(config.get_today()) + "/" + config.create_pdf_name(para)
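
The other path helpers used by bridger_crawling() (bridger_pdf_local, bridger_pdf_target, bridger_json_target, bridger_temp_json_target) are not part of this listing. Judging from bridger_pdf_web() above and the folders created in bridger_crawling(), they presumably follow the same pattern; the sketches below are guesses for illustration, not the project's actual code.

# Hypothetical sketches modeled on bridger_pdf_web(); the real implementations
# of these helpers are not shown in this listing.
def bridger_pdf_local(para):
    return config.local_bridge_folder + config.date_format(config.get_today()) + "/" + config.create_pdf_name(para)

def bridger_temp_json_target(para, case_name):
    return config.target_folder + case_name + "/bridgerJson/" + para + "_bridge_temp.json"
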
Example #5
        if hasattr(e, "reason"):
            logger.error("Reason: " + str(e.reason))
        return NOT_FOUND

    if crawler is None or crawler == NOT_FOUND:
        logger.warning("No crawler results: " + str(crawler_url))
        return NOT_FOUND

    logger.debug("Google Crawler Result: " + str(crawler))

    store_path = config.local_google_folder
    if not os.path.exists(store_path):
        os.makedirs(store_path)
    out_pdf = item.replace(' ', '_') + '.pdf'
    local = config.local_google_folder + config.date_format(config.get_today()) + "/"
    config.mkdirs(local)
    local = local + out_pdf
    target_folder = config.target_folder + case_name + "/originalEvidence"
    print "target_folder========"
    print target_folder
    if not os.path.exists(target_folder):
        print "target_folder doesnt exist!!"
        os.makedirs(target_folder)
    target = target_folder + "/" + out_pdf

    full_url = config.crawler_url + config.google_pdf_folder + craw_pdf
    logger.info("Crawling: " + full_url + " " + local)
    logger.info("Crawling:================ " + full_url + " " + target)
    urllib.urlretrieve(full_url, local)
    # urllib.urlretrieve(full_url, target)