Example #1
def get_chinese_similarity(s1, s2):
    """
    Get the similarity of two Chinese strings.
    """
    hash1 = simhash([ smart_unicode(x) for x in seg_txt(smart_str(s1)) ])
    hash2 = simhash([ smart_unicode(x) for x in seg_txt(smart_str(s2)) ])
    return hash1.similarity(hash2)
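
A minimal usage sketch (assumptions: seg_txt comes from a Chinese word-segmentation library such as yaha, and smart_str/smart_unicode are the Django encoding helpers):

# Hypothetical call; strings and result are illustrative
s1 = u'今天天气很好'
s2 = u'今天天气不错'
print get_chinese_similarity(s1, s2)  # a float in [0, 1]; higher means more similar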
Example #2
def compute_similarities(text, models, count=None):
    """Finds items that are similar to the specified text
    :param text: The text to be used for comparison
    :param models: The list of models to be compared against text
                   Each of the entries should have a simhash property
    :param count: The no. of similar items to return
    """
    # Get the simhash of the submitted message
    _hash = simhash(util.unicodeToAscii(text))
    candidates, scores = {}, []

    # TODO: Investigate ways of speeding this up - complexity is O(n)
    for model in models:
        target = simhash(hash=long(model.simhash))
        if long(target) == long(_hash):
            continue
        similarity = _hash.similarity(target)
        if similarity >= similarity_threshold:
            scores.append((model.id, similarity))
            candidates[model.id] = model
    if len(scores) == 0:
        return []

    scores.sort(key=lambda x: x[1], reverse=True)
    result_size = max_similar_messages if count is None else count
    # guard against indexing past the end of the scores list
    result_size = min(result_size, len(scores))

    retval = []
    for x in range(result_size):
        message_dict = candidates[scores[x][0]].as_dict()
        del message_dict['simhash']
        message_dict['score'] = scores[x][1]
        retval.append(message_dict)
    return retval
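
compute_similarities relies on two module-level settings that are not shown here; a plausible configuration, with values assumed rather than taken from the source:

# Assumed module-level configuration (names from the code above, values illustrative)
similarity_threshold = 0.875   # near-duplicate cutoff; 0.875 is the value suggested in Example #23
max_similar_messages = 5       # default result size when count is None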
Example #4
 def is_similar_page(self, page1, page2):
     hash1 = simhash(page1)
     hash2 = simhash(page2)
     similar = hash1.similarity(hash2)
     if similar > 0.85:  # current threshold is defined as 0.85
         return True
     else:
         return False
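
In the python-hashes library this snippet appears to target, similarity() returns the fraction of matching fingerprint bits, so the 0.85 threshold can be restated as a Hamming distance (assuming 64-bit hashes):

# Equivalent Hamming-distance threshold, assuming 64-bit fingerprints
max_differing_bits = int(64 * (1 - 0.85))  # at most 9 bits may differ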
Example #5
def get_simhash(shingles1,
                shingles2,
                simhash_bytes=simhash_bytes,
                hashfunc=hashfunc):
    sim1 = simhash(shingles1, hashbits=simhash_bytes)
    sim2 = simhash(shingles2, hashbits=simhash_bytes)

    return sim1.similarity(sim2)
Example #6
def simhash_hamming_corrcoef(A, B):
    str_A = []
    str_B = []
    for i in xrange(len(A)):
        str_A.append(str(A[i]))
        str_B.append(str(B[i]))
    hash_A = simhash(','.join(str_A))
    hash_B = simhash(','.join(str_B))
    return hash_A.similarity(hash_B)
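
For orientation, similarity() in python-hashes-style libraries is the complement of the normalized Hamming distance; a sketch under that assumption:

def similarity_sketch(hash_a, hash_b, hashbits=64):
    # fraction of fingerprint bits on which the two hashes agree
    differing = bin(long(hash_a) ^ long(hash_b)).count('1')
    return 1.0 - float(differing) / hashbits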
Example #7
    def _get_diff_ratio(self, a_str, b_str):
        '''
        Return the simhash similarity ratio of two strings; 0 if either is None.
        '''
        if a_str is None or b_str is None:
            return 0

        a_hash = simhash(a_str.split())
        b_hash = simhash(b_str.split())
        ratio = a_hash.similarity(b_hash)
        return ratio
Example #8
def get_simhash(sentence1, sentence2):
    hash1 = simhash(get_cut_sentence(sentence1))
    hash2 = simhash(get_cut_sentence(sentence2))
    # print(hash1)
    # print(hash2)
    similarity = hash1.similarity(hash2)
    print(similarity)
    if similarity > 0.8:
        return similarity
    else:
        return False
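
get_cut_sentence is not shown above; a plausible implementation, assuming jieba for segmentation:

import jieba

def get_cut_sentence(sentence):
    # segment the sentence so that simhash hashes word-level features
    # instead of the raw character string
    return ' '.join(jieba.cut(sentence))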
Example #9
File: fuzzy.py  Project: motord/banter
def decide(forktionary, message):
    message_hash = simhash(message.split())
    similarity = 0
    val = None
    for key, value in forktionary.iteritems():
        key_hash = memcache.get(key)
        if not key_hash:
            key_hash = simhash(key.split())
            memcache.set(key, key_hash)
        # compare against every key, not only on cache misses
        sim = message_hash.similarity(key_hash)
        if sim > similarity:
            similarity = sim
            val = value
    return val
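
A hypothetical call, to illustrate the intended lookup (keys are candidate prompts, values are canned replies; memcache assumes the Google App Engine API):

forktionary = {'hello there friend': 'hi!', 'goodbye for now': 'bye!'}
print decide(forktionary, 'well hello there')  # expected to pick 'hi!'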
Example #10
    def run(self) -> dict:
        sim_hash_dict = dict()
        # tqdm is a Python library that displays progress bars
        # print(self.message_list)
        # print(type(self.df))
        for idx, value in self.df.iterrows():
            # hashbits: the number of hash bits to compare
            # print(value)
            sim = simhash(value['Content'], hashbits=self.hashbits)
            sim_dict = dict(message=value['Content'],
                            simhash=sim,
                            LineId=value['LineId'])
            if sim.hash in sim_hash_dict:
                sim_list = sim_hash_dict[sim.hash]
                if self.keep_same_count == 0 or len(sim_list) < self.keep_same_count:
                    sim_list.append(sim_dict)
                else:
                    print("Bin has reached its maximum capacity; skipping this record")
            else:
                sim_list = list()
                sim_list.append(sim_dict)
                sim_hash_dict[sim.hash] = sim_list
        total_group = len(sim_hash_dict)
        print('After Simhash Reduce, total: %s bin(s)' % total_group)

        print("Data reduction ratio: %s" % (1 - total_group / len(self.df)))
        return sim_hash_dict
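
A hypothetical driver for this method; LineId, Content, hashbits, and keep_same_count are taken from the code above, while the class name is assumed:

import pandas as pd

df = pd.DataFrame({'LineId': [1, 2],
                   'Content': ['error at node alpha', 'error at node beta']})
# reducer = SimhashReducer(df=df, hashbits=64, keep_same_count=0)  # class name assumed
# bins = reducer.run()  # maps each fingerprint to the list of records grouped under it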
Example #11
 def fit(self, X, y=None):
     texts = self._preprocess(X[self.field])
     print 'DEBUG: preprocess done'
     self.hashcodes = map(lambda s: simhash(s), texts)
     self.y = np.asarray(y)
     print 'DEBUG: fit done...'
     return self
Example #13
def get_feature_from_content(content):
    """
    Generate features from HTTP response content.
    Returns: a dict of features in the format
            <name_of_feature>: value_of_feature
    """
    feature_dict = {}
    """
    tl=re.findall("<title.*?\/title>",c)
    kw=re.findall('(?<=<meta name="keywords" content=").*(?=\/>)',c)
    tmpl=re.findall('(?<=<meta name="generator" content=").*(?=\/>)',c)
    uid=re.findall("UA-\d{5,10}-\d{1,4}",c)
    dm=re.findall("(?<=_gaq.push\(\['_setDomainName'),.*(?=\]\);)",c)
    tl="" if tl==[] else tl[0]
    kw="" if kw==[] else kw[0]
    tmpl="" if tmpl==[] else tmpl[0]
    uid="" if uid==[] else uid[0]
    dm="" if dm==[] else dm[0]
    """

    feature_dict["ctitle"] = re.findall("<title.*?\/title>", content)
    feature_dict["ckws"] = re.findall('(?<=<meta name="keywords" content=").*(?=\/>)', content)
    feature_dict["ctmpl"] = re.findall('(?<=<meta name="generator" content=").*(?=\/>)', content)
    feature_dict["gid"] = re.findall("UA-\d{5,10}-\d{1,4}", content)
    feature_dict["dm"] = re.findall("(?<=_gaq.push\(\['_setDomainName'),.*(?=\]\);)", content)

    for k in feature_dict:
        feature_dict[k] = "" if not feature_dict[k] else feature_dict[k][0]

    feature_dict["chash"] = str(simhash(content))
    content = "".join(content.split())
    feature_dict["clen"] = len(content)
    return feature_dict
Example #14
File: db.py  Project: themoep/elsim
    def percentages(self, vmx, threshold=10):
        elems_hash = set()

        signature_module = sign.Signature(vmx)

        for _cls in vmx.get_classes():
            if _cls.is_external():
                continue
            _class = _cls.get_vm_class()

            for method in _class.get_methods():
                code = method.get_code()
                if code is None:
                    continue
                # FIXME: shouldn't the same rules apply here as on import,
                # i.e. skip constructors and too-short methods?
                for i in signature_module.get_method_signature(
                        method,
                        predef_sign=sign.PredefinedSignature.SEQUENCE_BB).get_list():
                    elems_hash.add(int(simhash(i)))

        ret, info = self.db.elems_are_presents(elems_hash)
        sorted_ret = self._eval_res(ret, info, threshold)

        info = defaultdict(list)

        for k, values in sorted_ret.items():
            for j in sorted(values, key=itemgetter(1), reverse=True):
                info[k].append([j[0], j[1]])

        return info
Example #15
def add_message(deployment_id):
    """Adds a new message for the deployment in :deployment_id

    The input parameters are:
        message: string

    :param deployment_id: the id of the deployment
    """
    if not request.json:
        abort(400)
    _post = request.json
    if 'origin_message_id' not in _post or 'content' not in _post:
        abort(400)

    # Does the deployment exist
    deployment = Deployment.by_id(deployment_id)
    if deployment is None:
        abort(404)

    _hash = simhash(util.unicodeToAscii(_post['content']))
    message = Message(deployment_id=deployment_id,
                      origin_message_id=_post['origin_message_id'],
                      content=_post['content'],
                      simhash=str(_hash))
    message.create()
    return jsonify(message.as_dict())
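
A hypothetical client call against this endpoint (the URL pattern and host are assumptions; field names come from the code above):

import requests

resp = requests.post('http://localhost:5000/deployments/1/messages',
                     json={'origin_message_id': '1234',
                           'content': 'Bridge flooded near the market'})
print(resp.json())  # the created message, including its simhash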
Example #16
def hash_column_rows(column):
    '''
    Input:
    column <Series>  one column from pandas data frame

    Output:
    hashes: list of hash strings representing the column
    '''

    # column.apply(hash)
    # collect/cluster similar hashes into baskets
    # baskets = []
    # for row in column:
    #     # hash only the first row
    #     row_value = str(row).strip()
    #     row_hash = hash_func.compute_hash(row_value)
    #     basket_item = row_value
    #     if basket_item not in baskets:
    #         baskets.append(basket_item)
    # all fields are identical -> single basket for the column
    # if len(baskets) == 1:
    #     return baskets[0]
    # else:
    baskets = simhash(column)
    return baskets
Example #19
 def transform(self, X):
     texts = self._preprocess(X[self.field])
     hashcodes = map(lambda s: simhash(s), texts)
     print 'DEBUG: finish transform hashing'
     #similarity_indices = map(lambda h: self._find_neighbor(h), hashcodes)
     similarity_indices = Parallel(n_jobs=-1)(delayed(Simhash_find_neighbor)(self.hashcodes, h) 
         for h in hashcodes)
     return self.y[similarity_indices].reshape(-1, 1)
Example #20
File: db.py  Project: themoep/elsim
    def add(self,
            dx,
            name,
            sname,
            regexp_pattern=None,
            regexp_exclude_pattern=None):
        """
        Add all classes which match certain rules to the database.

        Only methods with a length >= 50 are added.
        No constructor (static and normal) methods are added.

        Additional excludes or whitelists can be defined by classname
        regexes.

        :param androguard.core.analysis.analysis.Analysis dx:
        :param str name: name, the first key in the tree
        :param str sname: subname, the second key in the tree
        :param str regexp_pattern: whitelist regex pattern
        :param str regexp_exclude_pattern: blacklist regex pattern
        """
        sign_module = sign.Signature(dx)

        for _cls in dx.get_classes():
            if _cls.is_external():
                continue
            _class = _cls.get_vm_class()

            # whitelist
            if regexp_pattern and not re.match(regexp_pattern,
                                               _class.get_name()):
                continue

            # blacklist
            if regexp_exclude_pattern and re.match(regexp_exclude_pattern,
                                                   _class.get_name()):
                continue

            print("\tadding", _class.get_name())
            for method in _class.get_methods():
                code = method.get_code()
                if (not code or method.get_length() < 50
                        or method.get_name() in ("<clinit>", "<init>")):
                    continue

                buff_list = sign_module.get_method_signature(
                    method,
                    predef_sign=sign.PredefinedSignature.SEQUENCE_BB).get_list()
                if len(set(buff_list)) == 1:
                    continue

                for e in buff_list:
                    self.db.add_element(name, sname, str(_class.get_name()),
                                        method.get_length(), int(simhash(e)))
Example #21
def process_source_html(html_src_file, visit_info):
    if isfile(html_src_file):
        visit_info.html_src_file = html_src_file
        html_src = open(html_src_file).read()
        visit_info.html_src_size = len(html_src)
        visit_info.fx_conn_error = is_moz_error_txt(html_src)
        visit_info.html_src_hash = mmh3.hash(html_src)
        visit_info.html_src_simhash = simhash(html_src)
        if not visit_info.fx_conn_error:
            visit_info.page_title = get_page_title_from_html_src(html_src)
            populate_site_generator(html_src, visit_info)
Example #22
def doit():
    n = 0

    for i in coll.find():
        h = simhash(i['firstName'] + i['lastName'], hashbits=bits)
        ha = bitstring.pack('uint:%s' % bits, long(h))
        for j in xrange(0, bits):
            if limit is not None and n > limit:
                return
            #i['h%s' % j] = ha.hex
            #print ha.hex
            print '%s:%s' % (ha.uint, i['n'])
            ha.ror(bits / bits)  # rotate by a single bit each iteration

            n += 1
Example #23
    def computeSimilarities(self, msg):
        """
        returns a set of message id's with similarity score, sorted by
        similarity score.

        I recommend using >=0.875 to define 'near-dup'.
        :return [('1', 0.9), ('2', 0.8), ...], sorted by the real value
                (second element of each item) in decreasing order.
        """
        simhashCode = simhash(unicodeToAscii(msg['description']))

        retList = []
        for msgId, msgHash in self._simhashList:
            val = msgHash.similarity(simhashCode)
            retList.append((msgId, val))

        retList.sort(key=lambda x: x[1], reverse=True)
        return retList
Example #25
def is_similar(title):
    db = redis.StrictRedis(host=get_dbhost(), port=get_dbport(), db=0)
    result = jieba.analyse.extract_tags(title, topK=10)
    hash1 = simhash(result)
    title_md5 = cal_md5(title)
    title_set = set()
    for word in result:
        title_set = title_set | db.smembers(word)
    if not title_set:
        insert_title(result, title_md5, hash1)
        return False
    for every_md5_title in title_set:
        hash_list = db.smembers(every_md5_title)
        # take the first stored hash for this title (fall back to hash1 if empty)
        hash2 = int(next(iter(hash_list))) if hash_list else hash1
        if hamming_distance(hash1, hash2) < 3:
            return True
    insert_title(result, title_md5, hash1)
    return False
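
hamming_distance is referenced but not defined above; a common definition for integer fingerprints (an assumption, not this project's code):

def hamming_distance(h1, h2):
    # number of bit positions in which the two fingerprints differ
    return bin(int(h1) ^ int(h2)).count('1')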
Example #26
    def train(self, messageList):
        """Takes list of messages. each message is a dictionary.
        """
        assert False, "Not implemented yet"

        #--- save to _messages, and compute their hashcodes for simhash
        self._messageMap = dict(((v['id'], v) for v in messageList))
        self._simhashList = [(v['id'],
                              simhash(unicodeToAscii(v['description'])))
                             for v in messageList]

        #        self._messages = map(lambda x: x['description'], messageList);
        # self._simhashes = map(lambda x:
        # simhash(unicodeToAscii(x['description'])), messageList);

        #--- collect category list
        categorySet = set()
        for msg in messageList:
            categorySet.update(msg['categories'])

        #--- update categories
        categories = sorted(list(categorySet))

        #--- train classifiers
        minFreq = 5
        # 1
        unigramExtractor = DssgUnigramExtractor()

        def dssgVectorizerGenerator():
            return DssgVectorizerUnigramCount(unigramExtractor, minFreq)

        def dssgBinaryClassifierTrainer(train):
            return DssgBinaryClassifierSVC.train(train,
                                                 dssgVectorizerGenerator(),
                                                 balance=False)

        categoryClassifier = DssgCategoryClassifier.train(
            dssgBinaryClassifierTrainer, messageList, dssgVectorizerGenerator)
        self._categoryClassifier = categoryClassifier
        return self
Example #28
File: similar.py  Project: Priya22/tweedr
    def __call__(self, dict_):
        text = dict_['text']
        self_simhash = simhash(text)

        fuzzy_count = 0
        sum_other_votes = 0
        for other_simhash in self.simhashes:
            if self_simhash.similarity(other_simhash) > self.threshold:
                # increment the votes of the others
                other_votes = self.votes[other_simhash.hash] = self.votes.get(
                    other_simhash.hash, 1) + 1
                fuzzy_count += 1
                sum_other_votes += other_votes

        # should self.votes be elevated based on fuzzy_count?
        self.votes[self_simhash.hash] = self.votes.get(self_simhash.hash, 0) + 1

        # maybe normalize based on the number of total votes?
        dict_['fuzzy_count'] = fuzzy_count
        dict_['fuzzy_votes'] = sum_other_votes

        # store simhash in global state now that we've finished processing
        self.simhashes.append(self_simhash)
        return dict_
Example #29
    def process_item(self, item, spider):
        pinyin_content = ' '.join(
            [i for i in lazy_pinyin(item['content'].strip()) if i.strip()])
        pinyin_content = str(pinyin_content)
        #hashbits = simhash(item['content'], hashbits=32).hex()
        hashbits = simhash(pinyin_content, hashbits=64).hex()
        bitstr = str(bin(int(hashbits, 16)))[3:]
        bitstr = bitstr.zfill(64)  # left-pad with zeros to 64 bits

        print bitstr, len(bitstr)

        # manage duplicate url
        if [i for i in JobModel.where(url=item['url']).select()]:
            return item

        # manage duplicate content, using Google simhash
        if has_dup(bitstr, item['time'], item['url']):
            return item

        insert_dup(bitstr, item['url'], item['time'])

        jobitem = JobModel()
        jobitem.title = item['title']
        jobitem.url = item['url']
        jobitem.email = item['email']
        jobitem.content = item['content']
        jobitem.time = item['time']
        jobitem.type = item['type']
        # jobitem.tags = item['tags']
        tagobj = Tag()
        itemtag = tagobj.gettag(item['content'], item['title'])

        jobtag = 'nontec'
        for tag in itemtag.split("\t"):
            if u'技术' in tag:  # u'技术' means 'technical'
                jobtag = 'tec'
        jobitem.jobtag = jobtag
        jobitem.tags = str(itemtag)
        jobitem.save()

        # create tag if necessary
        tags = []
        for tag in itemtag.split("\t"):
            tagmodel = TagModel.where(tag=tag).select()
            if not [i for i in tagmodel]:
                tagmodel = TagModel()
                tagmodel.tag = tag
                tagmodel.save()

            tagmodel = TagModel.where(tag=tag).select()
            for i in tagmodel:
                tags.append(int(i.id))

        # save the tags
        for tag in tags:
            jobtagobj = JobCrossTagModel()
            jobtagobj.url = item['url']
            jobtagobj.tagid = tag
            jobtagobj.type = item['type']
            jobtagobj.time = item['time']
            jobtagobj.save()

        return item
Example #30
def add_report(deployment_id):
    """Adds a new report to the deployment specified by the ``deployment_id``
    parameter

    Input parameters:
        description: string - Description of the report
        categories: array of integers - category ids

    :param deployment_id: the id of the deployment
    """
    verify_deployment(deployment_id)
    errors = {}
    _post = request.json
    # Check for fields
    if 'origin_report_id' not in _post:
        errors['origin_report_id'] = 'The report id is missing'
    if 'title' not in _post:
        errors['title'] = 'The report title is missing'
    if 'description' not in _post:
        errors['description'] = 'The report description is missing'
    if 'categories' not in _post or len(_post['categories']) == 0:
        errors['categories'] = 'The report categories must be specified'

    # Did we encounter any errors?
    if len(errors) > 0:
        app.logger.error("There are some errors in the request %r" % errors)
        abort(400)

    # Does the specified report already exist?
    _report = db.session.query(Report).\
        filter(Report.origin_report_id == _post['origin_report_id'],
               Report.deployment_id == deployment_id).first()

    if _report is not None:
        app.logger.error("The report %s has already been registered" %
                         _post['origin_report_id'])
        abort(400)

    # Get the categories
    categories = db.session.query(Category).\
        filter(Category.deployment_id == deployment_id,
               Category.origin_category_id.in_(_post['categories'])).all()

    # Have the specified category ids been registered?
    if len(categories) == 0:
        app.logger.error("The specified categories are invalid")
        abort(400)

    # Compute the simhash on the report description
    _hash = simhash(util.unicodeToAscii(_post['description']))
    report = Report(deployment_id=deployment_id,
                    origin_report_id=_post['origin_report_id'],
                    title=_post['title'],
                    description=_post['description'],
                    simhash=str(_hash))
    # Create the report
    report.create()

    # Save the report categories
    report_categories = []
    for category in categories:
        rc = ReportCategory(report_id=report.id, category_id=category.id)
        report_categories.append(rc)
    ReportCategory.create_all(report_categories)

    return jsonify(report.as_dict())
Example #32
#!/usr/bin/env python

import pymongo
import bitstring

from hashes.simhash import simhash


connection = pymongo.Connection("localhost", 27017)

db = connection.pace
coll = db.people

for i in coll.find():
    h = simhash(i['firstName'] + i['lastName'], hashbits=64)

    ha = bitstring.pack('uint:64', long(h))
    for j in xrange(0, 8):
        i['h%s' % j] = ha.hex
        #print ha.hex
        ha.ror(64 / 8)

    coll.update({'n': i['n']}, i)
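
Storing eight rotations of the 64-bit fingerprint (h0..h7) looks like the block-permutation trick from Manku et al.'s near-duplicate detection paper: two fingerprints within Hamming distance 7 must agree exactly on at least one of the eight 8-bit blocks, so candidates can be fetched by matching a block of some rotation. A sketch of that pigeonhole check (illustrative, not this project's query code):

def shares_a_block(h1, h2, bits=64, blocks=8):
    # if the fingerprints differ in fewer than `blocks` bits, at least one
    # aligned block must be identical in both (pigeonhole principle)
    width = bits // blocks
    mask = (1 << width) - 1
    return any(((h1 >> (i * width)) & mask) == ((h2 >> (i * width)) & mask)
               for i in xrange(blocks))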
Example #33
 def compute_hash(self, text):
     return simhash(text, hashbits=self.hashbits)  #.hex()
Example #34
# -*- coding:utf-8 -*-
# @author:Eric Luo
# @file:py-hash0.1.py
# @time:2017/5/24 0024 16:31

from hashes.simhash import simhash
hash1 = simhash('This is a test string one.')
hash2 = simhash('This is a test string TWO.')
print(hash1, hash2)
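
A natural follow-up is to compare the two fingerprints (the exact score depends on the library's tokenization and hash width):

print(hash1.similarity(hash2))  # expected to be close to 1.0; the strings differ in one word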
Example #35
def get_similarity(s1, s2):
    """
    Get the similarity of two English strings.
    """
    return simhash(s1).similarity(simhash(s2))
Example #36
            exact, subs, bags = count_matches(k1[1:], k2[1:])
            lenOk = len(w1) > 2 and len(w2) > 2

            cond1 = exact > 1 and lenOk and (w1 == w2 or w1.find(w2) != -1
                                             or w2.find(w1) != -1)
            cond2 = w1 == w2 and lenOk and len(k2) == 1 and len(k1) == 2
            cond3 = exact >= 2 and len(k2) == 3 and len(k1) == 3
            cond4 = w1 == w2 and lenOk and ((bags == 1 and len(k1) + len(k2) <= 5)
                                            or (bags == 2 and len(k1) + len(k2) <= 7))

            if cond1 or cond2 or cond3 or cond4:
                ds.union(p2, q2)
                g.write("%s\n%s\n\n" % (p1, q1))
    g.close()

    remaining = strings_remaining(strings, ds)
    print 'merged neighbours, remaining: %d' % len(remaining)

    hashes, dglinks = [], []
    for k, v in remaining:
        hashes.append(simhash(k, hashbits=HB))
        dglinks.append(set(digit_in[k]).union(set(digit_out[k])))
    print 'hashing done'

    merging = find_merges_by_hash(remaining, hashes, dglinks)
    print "indices found: %d" % len(merging)
    merge_sets(ds, remaining, merging)
    print 'merged by hashes'

    write_mapping(digits, ds, strings, N)
    print 'done'

Example #38
from hashes.simhash import simhash

if __name__ == '__main__':
    f = open('flat.txt', 'r')
    #f = open('thingiverse_all_names.csv')
    data = [line.strip() for line in f.readlines()]
    f.close()

    # print data
    all_hashes = dict([(d, simhash(d)) for d in data])

    for k, h in all_hashes.items():
        print "%s %s" % (k, h)
        print all_hashes['Flatpack Bunny'].similarity(h)

Example #39
 def run(self, obj, config):
     self.config = config
     self.obj = obj
     user = self.current_task.user
     tlp_value = self.config.get("tlp_value", "tlp_value")
     url = obj['value']
     if not (obj._meta['crits_type'] == 'Indicator'
             and obj['ind_type'] == 'URI'):
         self._error('This object type cannot use service Url analysis.')
         return False
     # verify that the URL scheme is http or https
     if url.startswith('https://') or url.startswith('http://'):
         # set up headless PhantomJS to render the URL
         dcap = dict(DesiredCapabilities.PHANTOMJS)
         dcap["phantomjs.page.settings.userAgent"] = (
             "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36"
         )
         driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                      service_args=[
                                          '--ignore-ssl-errors=true',
                                          '--ssl-protocol=any',
                                          '--web-security=false'
                                      ])
         driver.set_window_size(1024, 768)
         driver.set_page_load_timeout(30)
         driver.get(url)
         time.sleep(3)
         #driver.save_screenshot('testing1.png')
         screen = driver.get_screenshot_as_png()
         ofile = io.BytesIO()
         im = Image.open(StringIO.StringIO(screen))
         im.save(ofile, 'PNG', optimize=True)
         ofile.seek(0)
         res = add_screenshot(description='Render of a website URL',
                              tags=None,
                              method=self.name,
                              source=obj.source,
                              reference=None,
                              analyst=self.current_task.user.username,
                              screenshot=ofile,
                              screenshot_ids=None,
                              oid=obj.id,
                              tlp=tlp_value,
                              otype="Indicator")
         if res.get('message') and res.get('success') is True:
             self._warning("res-message: %s id:%s" %
                           (res.get('message'), res.get('id')))
             self._add_result('ScreenShot URL', res.get('id'),
                              {'Message': res.get('message')})
         # parse the HAR network log
         har = driver.get_log('har')
         if type(har) is list and har:
             if type(har[0]) is dict and 'message' in har[0]:
                 # the HAR message arrives as a JSON string; decode it
                 try:
                     har[0]['message'] = json.loads(har[0]['message'])
                 except:
                     self._warning('Har log error to parse json')
                 log = None
                 if type(har[0]['message']) is dict:
                     log = har[0]['message'].get('log')
                 if type(log) is dict and 'pages' in log:
                     title = 'Result of '
                     pages = log['pages']
                     if type(pages) is list and pages and type(pages[0]) is dict:
                         if 'id' in pages[0]:
                             title += pages[0]['id']
                         if 'title' in pages[0]:
                             self._add_result(title, 'Title',
                                              {'value': pages[0]['title']})
                     # parse each request and response
                     if type(log.get('entries')) is list and log['entries']:
                         count = 1
                         type_r = ['cookies', 'queryString', 'headers']
                         type_rs = ['content', 'timings', 'cache']
                         for elem_rr in log['entries']:
                             result_title = (title +
                                             ' -- Informations Request & Response num:' +
                                             str(count))
                             for k, v in elem_rr.iteritems():
                                 if type(v) is not dict:
                                     self._add_result(result_title, k,
                                                      {'value': v})
                             for k, v in elem_rr.iteritems():
                                 if type(v) is dict:
                                     for kx, vx in v.iteritems():
                                         self._add_result(
                                             result_title + ' -- ' + str(k),
                                             kx, {'value': vx})
                             count += 1
         #save page source in rawdata
         if not user.has_access_to(RawDataACL.WRITE):
             self._info(driver.page_source.encode('utf8'))
         else:
             #can write
             result = handle_raw_data_file(
                 driver.page_source.encode('utf8'),
                 obj.source,
                 user=self.current_task.user,
                 description="Code page for URL: %s" % url,
                 title=url,
                 data_type="Text",
                 tool_name=self.name,
                 tool_version=self.version,
                 tool_details=self.description)
             if result['success']:
                 obj.add_relationship(
                     result['object'],
                     RelationshipTypes.CONTAINED_WITHIN,
                     analyst=self.current_task.user.username,
                     rel_reason="Extracted from URI")
                 obj.save()
             self._add_result(
                 'Code Page', url, {
                     'RawData TLO ID':
                     result['_id'],
                     'md5 file':
                     md5(driver.page_source.encode('utf8')).hexdigest(),
                     'simhash':
                     str(simhash(driver.page_source.encode('utf8')))
                 })
         driver.close()
         driver.service.process.terminate()
         time.sleep(1)
         # get certificate information - ref: https://stackoverflow.com/questions/30862099/how-can-i-get-certificate-issuer-information-in-python
         # (Selenium does not expose certificate details)
         if url.startswith('https://'):
             try:
                 host = urlparse(url).hostname
                 port = urlparse(url).port
                 if (port is None):
                     port = 443
                 s = socks.socksocket()
                 if settings.HTTP_PROXY:
                     type_proxy = socks.PROXY_TYPE_SOCKS5
                     if settings.HTTP_PROXY.startswith('http://'):
                         type_proxy = socks.PROXY_TYPE_HTTP
                     s.setproxy(type_proxy,
                                urlparse(settings.HTTP_PROXY).hostname,
                                port=urlparse(settings.HTTP_PROXY).port)
                 s.connect((host, port))
                 ss = ssl.wrap_socket(s)
                 pem_data = ssl.DER_cert_to_PEM_cert(ss.getpeercert(True))
                 ss.close()
                 s.close()
                 cert = M2Crypto.X509.load_cert_string(pem_data)
                 #put ssl information
                 self._add_result(
                     'SSL informations', 'Subject',
                     {'value': str(cert.get_subject().as_text())})
                 self._add_result(
                     'SSL informations', 'Issuer',
                     {'value': str(cert.get_issuer().as_text())})
                 self._add_result('SSL informations', 'Version',
                                  {'value': str(cert.get_version())})
                 self._add_result('SSL informations', 'Date before',
                                  {'value': str(cert.get_not_before())})
                 self._add_result('SSL informations', 'Date after',
                                  {'value': str(cert.get_not_after())})
                 self._add_result('SSL informations', 'Serial Number',
                                  {'value': str(cert.get_serial_number())})
                 self._add_result('SSL informations', 'Verify',
                                  {'value': str(cert.verify())})
                 self._add_result('SSL informations', 'Fingerprint MD5',
                                  {'value': str(cert.get_fingerprint())})
                 for i in range(0, cert.get_ext_count()):
                     self._add_result(
                         'SSL informations Extension',
                         str(cert.get_ext_at(i).get_name()),
                         {'value': str(cert.get_ext_at(i).get_value())})
                 #https://www.heikkitoivonen.net/m2crypto/api/M2Crypto.X509-module.html
             except:
                 self._error('Error getting certificate information.')
             else:
                 self._info(str(cert))
         driver.service.process.kill()
         driver.quit()
         self._info('END')
Example #40
def sh(f):
    return [simhash(open(f).read()), f]
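
A hypothetical driver, fingerprinting a set of files for later pairwise comparison:

import glob

pairs = [sh(f) for f in glob.glob('*.txt')]  # [[simhash, filename], ...]
# e.g. compare the first two files:
# print pairs[0][0].similarity(pairs[1][0])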