def test_add_same_swf_to_db(self):
    """Add the same SWF to the DB twice and check both stored rows.

    The SWF is inserted once, looked up again by its hash, and the stored
    local path and occurrence vector are copied back onto ``swf_info``
    before it is inserted a second time. Both resulting rows are then
    compared against ``swf_info`` with ``compare_swf_info``.
    """
    swf_info = self.populate_swf_info()
    swf_info.duplicate = 1  # we've seen this swf before
    swf_id = swfu.add_swf_to_db(swf_info, self.db_conn)

    rows = swfu.get_swf_obj_from_db('hash', swf_info.hash, self.db_cursor)
    if not rows:
        # Without this guard the assignments below would hit unbound names
        # and the test would die with a NameError instead of a failure.
        self.fail("Cannot find SWF by hash in db")

    first_row = rows[0]
    occ_vector = swfu.str_to_vector(first_row['occ_vector'])
    swf_info.filename = first_row['local_path']
    swf_info.duplicate = 1
    swf_info.occ_vector = occ_vector
    swf_id_2 = swfu.add_swf_to_db(swf_info, self.db_conn)

    if not self.db_cursor.execute('SELECT * FROM swf_obj WHERE id = %s',
                                  (swf_id, )):
        self.fail("Cannot find SWF in db")
    row_orig = self.db_cursor.fetchone()

    if not self.db_cursor.execute('SELECT * FROM swf_obj WHERE id = %s',
                                  (swf_id_2, )):
        self.fail("Cannot find second SWF in db")
    row_second = self.db_cursor.fetchone()

    self.compare_swf_info(row_orig, swf_info)
    self.compare_swf_info(row_second, swf_info)
def should_crawl_and_find_swfs(self, url, expected_strings=(None, None),
                               unexpected_strings=(None, None),
                               crawler_type='chrome_lazy'):
    """Crawl ``url`` and check the SWF rows stored for that crawl.

    Args:
        url: URL handed to ``ag.crawl_sites`` (as rank 1).
        expected_strings: ``(fields, values)`` pair of parallel sequences.
            Each value must occur in the matching field of at least one
            row, otherwise the test fails.
        unexpected_strings: ``(fields, values)`` pair of parallel
            sequences. The test fails as soon as a value occurs in the
            matching field of any row.
        crawler_type: crawler name passed through to ``ag.crawl_sites``.

    A check is skipped when either half of its pair is empty or ``None``;
    expected fields given without values are reported as not found.
    """
    expected_fields, expected_values = expected_strings
    unexpected_fields, unexpected_values = unexpected_strings

    # Build the (field, value) pairs once; list() keeps them re-iterable
    # for every row on Python 3, where zip() returns a one-shot iterator.
    expected_pairs = []
    if expected_fields and expected_values:
        expected_pairs = list(zip(expected_fields, expected_values))
    unexpected_pairs = []
    if unexpected_fields and unexpected_values:
        unexpected_pairs = list(zip(unexpected_fields, unexpected_values))
    found_dict = dict.fromkeys(expected_fields, False) if expected_fields else {}

    crawl_id = ag.crawl_sites([(1, url)], crawler_type, num_crawl_urls=1)

    db_conn = dbu.mysql_init_db()
    try:
        db_curs = db_conn.cursor(mdb.cursors.DictCursor)
        try:
            rows = swu.get_swf_obj_from_db('crawl_id', int(crawl_id), db_curs)
            for row in rows:
                for expected_field, expected_value in expected_pairs:
                    if expected_value in row[expected_field]:
                        found_dict[expected_field] = True
                        # Single parenthesised argument: same output under
                        # the print statement and the print function.
                        print('found in  %s' % (row[expected_field],))
                for unexpected_field, unexpected_value in unexpected_pairs:
                    if unexpected_value in row[unexpected_field]:
                        self.fail(
                            'Unexpected field %s with unexpected value %s found'
                            % (unexpected_field, unexpected_value))

            for field, found in found_dict.items():
                if not found:
                    self.fail('Cannot find %s' % field)
        finally:
            # self.fail() raises, so close here to avoid leaking the cursor.
            db_curs.close()
    finally:
        db_conn.close()
def test_get_swf_obj_from_db(self):
    """Store a SWF and check it can be read back by its id.

    Fails when the lookup returns nothing, or when any returned row has
    no 'http' in its ``swf_url`` column.
    """
    swf_info = self.populate_swf_info()
    swf_id = swfu.add_swf_to_db(swf_info, self.db_conn)
    rows = swfu.get_swf_obj_from_db('id', swf_id, self.db_cursor)

    # assertTrue replaces the deprecated assert_ alias; testing rows itself
    # also gives a clean failure if the lookup returns None.
    self.assertTrue(rows, 'No SWF can be found in DB')
    for row in rows:
        self.assertTrue('http' in row['swf_url'],
                        'swf url is does not have http in it')
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    """Save a SWF found in a response body and record it in the database.

    Nothing happens when ``msg`` has no response or an empty body. A body
    whose first three bytes are not in ``SWF_MAGIC_NUMBERS`` is ignored,
    with a warning logged if the request path contains '.swf'.

    For a SWF body the content hash is looked up in the DB. If unknown, the
    body is written under ``dir_path`` and its occurrence vector is computed
    from the saved file; otherwise the stored vector and local path are
    reused and the record is flagged as a duplicate. A new row is added and
    committed either way.

    Args:
        msg: object exposing ``request.headers``, ``request.path``,
            ``request.get_url()`` and ``response.content``.
        crawl_id: id stored on the new DB record.
        dir_path: directory the SWF file is written to.
        prefix: file name prefix; the text after its last '/' is split at
            the first '-' into rank and domain.
    """
    referer_values = msg.request.headers['Referer']
    referer = referer_values[0] if referer_values else ""

    if not (msg.response and msg.response.content):
        return

    content = msg.response.content
    swf_url = msg.request.get_url()
    # Single parenthesised argument: same output under the print statement
    # and the print function.
    print(swf_url)

    # The magic-number test is too wide, but the decompiler will discard
    # anything that is not really a SWF.
    if content[:3] not in SWF_MAGIC_NUMBERS:
        if '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" %
                           (msg.request.path, content[:100]))
        return

    swf_hash = ut.hash_text(content)
    db_conn = dbu.mysql_init_db()
    try:
        db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
        try:
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            if not rows:
                swf_filename = os.path.join(
                    dir_path,
                    "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                wl_log.info("SWF saved %s referrer: %s" %
                            (os.path.basename(swf_filename), referer))
                fu.write_to_file(swf_filename, content)
                vector = swu.get_occurence_vector_from_swf(
                    swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" %
                            (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1

            # partition() never fails to unpack: a prefix without '-' (such
            # as the default '?') gives an empty domain instead of raising
            # ValueError.
            rank, _, domain = prefix.rsplit('/', 1)[-1].partition('-')

            swf_info = swu.SwfInfo()
            swf_info.rank = rank  # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            # 1 for repeated swfs (ones whose hash is already in the DB)
            swf_info.duplicate = duplicate_swf
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(
                swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id

            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
        finally:
            db_cursor.close()
    finally:
        # Close even when saving or decompiling raised.
        db_conn.close()
def should_crawl_and_find_swfs(self, url, expected_strings=(None, None),
                               unexpected_strings=(None, None),
                               crawler_type='chrome_lazy'):
    """Crawl ``url`` and check the SWF rows stored for that crawl.

    Args:
        url: URL handed to ``ag.crawl_sites`` (as rank 1).
        expected_strings: ``(fields, values)`` pair of parallel sequences.
            Each value must occur in the matching field of at least one
            row, otherwise the test fails.
        unexpected_strings: ``(fields, values)`` pair of parallel
            sequences. The test fails as soon as a value occurs in the
            matching field of any row.
        crawler_type: crawler name passed through to ``ag.crawl_sites``.

    A check is skipped when either half of its pair is empty or ``None``;
    expected fields given without values are reported as not found.
    """
    expected_fields, expected_values = expected_strings
    unexpected_fields, unexpected_values = unexpected_strings

    # Build the (field, value) pairs once; list() keeps them re-iterable
    # for every row on Python 3, where zip() returns a one-shot iterator.
    expected_pairs = []
    if expected_fields and expected_values:
        expected_pairs = list(zip(expected_fields, expected_values))
    unexpected_pairs = []
    if unexpected_fields and unexpected_values:
        unexpected_pairs = list(zip(unexpected_fields, unexpected_values))
    found_dict = dict.fromkeys(expected_fields, False) if expected_fields else {}

    crawl_id = ag.crawl_sites([(1, url)], crawler_type, num_crawl_urls=1)

    db_conn = dbu.mysql_init_db()
    try:
        db_curs = db_conn.cursor(mdb.cursors.DictCursor)
        try:
            rows = swu.get_swf_obj_from_db('crawl_id', int(crawl_id), db_curs)
            for row in rows:
                for expected_field, expected_value in expected_pairs:
                    if expected_value in row[expected_field]:
                        found_dict[expected_field] = True
                        # Single parenthesised argument: same output under
                        # the print statement and the print function.
                        print('found in  %s' % (row[expected_field],))
                for unexpected_field, unexpected_value in unexpected_pairs:
                    if unexpected_value in row[unexpected_field]:
                        self.fail(
                            'Unexpected field %s with unexpected value %s found'
                            % (unexpected_field, unexpected_value))

            for field, found in found_dict.items():
                if not found:
                    self.fail('Cannot find %s' % field)
        finally:
            # self.fail() raises, so close here to avoid leaking the cursor.
            db_curs.close()
    finally:
        db_conn.close()
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    """Save a SWF found in a response body and record it in the database.

    Nothing happens when ``msg`` has no response or an empty body. A body
    whose first three bytes are not in ``SWF_MAGIC_NUMBERS`` is ignored,
    with a warning logged if the request path contains '.swf'.

    For a SWF body the content hash is looked up in the DB. If unknown, the
    body is written under ``dir_path`` and its occurrence vector is computed
    from the saved file; otherwise the stored vector and local path are
    reused and the record is flagged as a duplicate. A new row is added and
    committed either way.

    Args:
        msg: object exposing ``request.headers``, ``request.path``,
            ``request.get_url()`` and ``response.content``.
        crawl_id: id stored on the new DB record.
        dir_path: directory the SWF file is written to.
        prefix: file name prefix; the text after its last '/' is split at
            the first '-' into rank and domain.
    """
    referer_values = msg.request.headers['Referer']
    referer = referer_values[0] if referer_values else ""

    if not (msg.response and msg.response.content):
        return

    content = msg.response.content
    swf_url = msg.request.get_url()
    # Single parenthesised argument: same output under the print statement
    # and the print function.
    print(swf_url)

    # The magic-number test is too wide, but the decompiler will discard
    # anything that is not really a SWF.
    if content[:3] not in SWF_MAGIC_NUMBERS:
        if '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" %
                           (msg.request.path, content[:100]))
        return

    swf_hash = ut.hash_text(content)
    db_conn = dbu.mysql_init_db()
    try:
        db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
        try:
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            if not rows:
                swf_filename = os.path.join(
                    dir_path,
                    "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                wl_log.info("SWF saved %s referrer: %s" %
                            (os.path.basename(swf_filename), referer))
                fu.write_to_file(swf_filename, content)
                vector = swu.get_occurence_vector_from_swf(
                    swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" %
                            (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1

            # partition() never fails to unpack: a prefix without '-' (such
            # as the default '?') gives an empty domain instead of raising
            # ValueError.
            rank, _, domain = prefix.rsplit('/', 1)[-1].partition('-')

            swf_info = swu.SwfInfo()
            swf_info.rank = rank  # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            # 1 for repeated swfs (ones whose hash is already in the DB)
            swf_info.duplicate = duplicate_swf
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(
                swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id

            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
        finally:
            db_cursor.close()
    finally:
        # Close even when saving or decompiling raised.
        db_conn.close()