def get_output_by_input_with_fil(self, **kwargs):
    """Project ``input_val`` through the "fil" parameters and split the result.

    Keyword Args:
        input_val: left operand of ``T.dot`` against ``self.W_fil``; an empty
            list is used when the keyword is omitted.

    Returns:
        tuple: ``(output_val, pp_val)`` where ``output_val`` is ``get_val``
        applied to the sigmoid of columns ``[0, self.n_h)`` of the projection
        and ``pp_val`` is ``get_val`` applied to the untouched columns
        ``[self.n_h, self.n_v)``.
    """
    # W_fil is created on demand (presumably get_fil_weight() populates it).
    if self.W_fil is None:
        self.get_fil_weight()

    projected = T.dot(kwargs.get("input_val", []), self.W_fil) + self.b_h_fil
    hidden_part = projected[:, 0:self.n_h]
    pp_part = projected[:, self.n_h:self.n_v]

    output_val = get_val(T.nnet.sigmoid(hidden_part))
    pp_val = get_val(pp_part)
    logger.info("fil shape %s", output_val.shape)
    return output_val, pp_val
def common_property(self, response, item, spider_item_field=None):
    """Fill ``item`` field by field, pausing whenever a follow-up request is needed.

    Original author's note (translated): strengthens field extraction after a
    follow-up request. A follow-up request can use this method to fetch
    several fields at once and may itself trigger further follow-up requests,
    so in theory the chaining is unlimited.

    Args:
        response: response the field values are extracted from.
        item: mapping that receives the extracted values.
        spider_item_field: remaining ``(key, spec)`` pairs to process when
            resuming after a follow-up request; when falsy, a deep copy of
            ``ITEM_FIELD[self.name]`` is used instead.

    Returns:
        The follow-up request when one was produced, otherwise ``None``.
    """
    pending = spider_item_field or copy.deepcopy(ITEM_FIELD[self.name])
    resumed = spider_item_field is not None

    while pending:
        key, spec = pending.pop(0)

        # Translated from the original (msc, 2016-11-25): a step only counts
        # as "after a follow-up request" when spider_item_field was supplied
        # and this field does not ask for a further follow-up. If a field
        # asks for another follow-up, the current handler still uses the
        # plain function/extract rather than the "after"-suffixed function.
        # For a field that needs no further follow-up, having the "after"
        # suffix or not makes no difference.
        is_after = resumed and not spec.get("request")
        extracted = get_val(spec, response, item, is_after)

        request_func = spec.get("request")
        if request_func and not extracted:
            follow_up = send_request_wrapper(response, item, key)(request_func)()
            if follow_up:
                return follow_up

        # Translated (msc, 2016-11-25): precedence is the value obtained after
        # the follow-up request, then the value from before it, then the
        # configured default.
        item[key] = extracted or item.get(key) or spec.get("default", "")

        # Translated (msc, 2016-11-26): for a field that declares a follow-up
        # request, stop processing the fields after it whether or not the
        # request was actually issued. Otherwise, when it is uncertain whether
        # the follow-up happens, later fields would be read from the first
        # response page.
        if request_func:
            break
def next_request_callback(self, response):
    """Complete a half-built item from the response of a follow-up request.

    The field to fill is named by ``response.meta["next_key"]``; its spec is
    looked up in ``ITEM_FIELD[self.name]`` (a sequence of ``(key, spec)``
    pairs) and the partially built item is restored from
    ``response.meta['item_half']``.

    Args:
        response: follow-up response whose ``meta`` carries ``next_key``,
            ``item_half`` and ``crawlid``.

    Returns:
        The item with the requested field filled in (falling back to the
        spec's ``"default"``, or ``""``).

    Raises:
        IndexError: if no entry in ``ITEM_FIELD[self.name]`` matches the key.
    """
    k = response.meta.get("next_key")
    self.logger.debug("start in parse %s ..." % k)

    # The original subscripted the result of filter(), which is an iterator on
    # Python 3 and raises TypeError. A list comprehension behaves the same on
    # Python 2 and 3 and still raises IndexError when the key is unknown.
    matches = [entry for entry in ITEM_FIELD[self.name] if entry[0] == k]
    v = matches[0][1]

    item = self.reset_item(response.meta['item_half'])
    item[k] = get_val(v, response, item, is_after=True) or v.get("default", "")
    self.logger.info("crawlid:%s, product_id %s, suceessfully yield item" %
                     (item.get("crawlid"), item.get("product_id", "unknow")))
    self.crawler.stats.inc_crawled_pages(response.meta['crawlid'])
    return item
def common_property(self, response, item):
    """Populate ``item`` from ``response`` using ``ITEM_FIELD[self.name]``.

    Each ``(key, spec)`` pair is extracted with ``get_val``. When nothing is
    extracted and the spec declares a ``"request"`` callable, a follow-up
    request is built; if that yields a request it is returned immediately and
    the remaining fields are left untouched.

    Args:
        response: response the field values are extracted from.
        item: mapping that receives the extracted values.

    Returns:
        The follow-up request when one was produced, otherwise ``None``.
    """
    for key, spec in ITEM_FIELD[self.name]:
        extracted = get_val(spec, response, item)

        # The "request" entry is only consulted when extraction came up empty.
        request_func = None if extracted else spec.get("request")
        if request_func:
            follow_up = send_request_wrapper(response, item, key)(request_func)()
            if follow_up:
                return follow_up

        item[key] = extracted or spec.get("default", "")
def parse_edb_cve(self, edb_item):
    """Scrape one Exploit-DB entry page and store it via ``self.insert_record``.

    Args:
        edb_item: mapping whose ``'id'`` entry is the Exploit-DB id used to
            build the page URL.

    Returns:
        None. Exactly one ``EdbRecord`` is inserted per call: a stub holding
        only the id when the page does not expose the id element (the record
        was probably removed), otherwise the fully populated record.
    """
    raw_id = edb_item['id']
    edb_id = "EDB-{}".format(raw_id)
    edb_url = "https://www.exploit-db.com/exploits/{}/".format(raw_id)
    element_xpath = {
        'edb_id':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[1]/div/div[1]/div/div/div/div[1]/h6/text()',
        'edb_title':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[1]/h1/text()',
        'edb_author':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[2]/div/div[1]/div/div/div/div[1]/h6/a/text()',
        'edb_published':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[3]/div/div[1]/div/div/div/div[2]/h6/text()',
        'edb_cve':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[1]/div/div[1]/div/div/div/div[2]/h6/a/text()',
        'edb_type':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[2]/div/div[1]/div/div/div/div[2]/h6/a/text()',
        'edb_platform':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[3]/div/div[1]/div/div/div/div[1]/h6/a/text()',
        'edb_vulnerable_app_url':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[3]/div/div[2]/div/a/@href',
        'edb_verified':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[1]/div/div[2]/div/i/@class',
        'edb_exploit_raw_url':
        '/html/body/div/div[2]/div[2]/div/div/div[1]/div/div[2]/div[1]/div[2]/div/div[2]/div/a[2]/@href',
    }
    page = self.request(edb_url)

    try:
        raw_id = page.html.xpath(
            element_xpath['edb_id'])[0].strip(':').strip()
        edb_id = "EDB-{}".format(raw_id)
    except Exception:
        # Deliberately broad: any failure to read the id element is treated
        # as "record gone". Store a stub and stop. Previously execution fell
        # through and kept scraping the same unusable page, which either
        # crashed further down or inserted a second record.
        logging.error("Request error, maybe record have been removed")
        self.insert_record(EdbRecord(edb_id=raw_id))
        return

    edb_title = get_val(page.html.xpath(element_xpath['edb_title']))
    edb_author = get_val(page.html.xpath(element_xpath['edb_author']))

    # The original guard compared the *list* of CVE numbers with the strings
    # '' and 'N/A', which is always true. Filter per element instead and fall
    # back to the 'N/A' marker when nothing usable remains.
    try:
        cve_numbers = [
            text.strip()
            for text in page.html.xpath(element_xpath['edb_cve'])
        ]
    except Exception:
        cve_numbers = []
    cve_numbers = [num for num in cve_numbers if num and num != 'N/A']
    if cve_numbers:
        edb_cve = ','.join("CVE-{}".format(num) for num in cve_numbers)
    else:
        edb_cve = 'N/A'

    edb_type = get_val(page.html.xpath(element_xpath['edb_type']))
    edb_platform = get_val(page.html.xpath(element_xpath['edb_platform']))

    # The verified flag is read from the icon's CSS class.
    verified_class = get_val(page.html.xpath(element_xpath['edb_verified']))
    edb_verified = 'Unverified' if 'mdi-close' in verified_class else 'Verified'

    edb_exploit_raw_url = 'https://www.exploit-db.com/raw/{}'.format(raw_id)

    edb_vulnerable_app_url = get_val(
        page.html.xpath(element_xpath['edb_vulnerable_app_url']))
    if edb_vulnerable_app_url != "":
        # The page gives a site-relative link; make it absolute.
        edb_vulnerable_app_url = 'https://www.exploit-db.com' + edb_vulnerable_app_url

    edb_published = page.html.xpath(
        element_xpath['edb_published'])[0].strip(':').strip()
    edb_collect_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    exploit_record = EdbRecord(
        edb_id=edb_id,
        edb_title=edb_title,
        edb_url=edb_url,
        edb_author=edb_author,
        edb_cve=edb_cve,
        edb_type=edb_type,
        edb_platform=edb_platform,
        edb_verified=edb_verified,
        edb_vulnerable_app_url=edb_vulnerable_app_url,
        edb_exploit_raw_url=edb_exploit_raw_url,
        edb_published=edb_published,
        edb_collect_date=edb_collect_date)
    self.insert_record(exploit_record)
import utils
import cv2

# Image that is reloaded and re-processed on every loop iteration.
path = 'resources/test.jpeg'

utils.initialize_trackbar()

while True:
    # Pair of values from utils.get_val() that is forwarded to preprocess()
    # (presumably the current trackbar thresholds - confirm in utils).
    t1, t2 = utils.get_val()

    # Load Image
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None
        # instead of raising; fail here with a clear message.
        raise FileNotFoundError("Could not read image: %s" % path)

    # Perform edge detection, dilation and erosion
    pp_img, orig_copy = utils.preprocess(img, t1, t2)
    imgContours = orig_copy.copy()  # Copying image for display purposes

    # Finding contours from the binary image
    contours = utils.get_contours(pp_img)
    biggest_contours = utils.getBigCntr(contours)
    cv2.drawContours(imgContours, contours, -1, (0, 255, 0), 2)
    reorderd_points = utils.reorder(biggest_contours)

    # Apply perspective transform on the resized image
    # NOTE(review): `final` and `imgContours` are computed but never shown.
    final = utils.get_perspective(reorderd_points, orig_copy)

    # Display the images
    cv2.imshow("Image", orig_copy)
    cv2.imshow("Preprocessed", pp_img)

    # HighGUI only repaints windows and services trackbar events inside
    # waitKey(); without it nothing is drawn and the loop can never be left.
    # Press 'q' to quit.
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cv2.destroyAllWindows()
def convert_hca_json_to_magetab(mode,
                                data_dir,
                                project_uuids_filter=None,
                                new_only=True,
                                sender=None,
                                email_recipients=None):
    """Translate HCA project json into MAGE-TAB (SDRF + IDF) files under data_dir.

    For every HCA project uuid selected for import, the function resolves the
    gxa accession, retrieves the project's json, builds one SDRF row per
    (donor, sequence file) pair in each bundle, and writes
    ``<accession>.sdrf.txt`` and ``<accession>.idf.txt`` into ``data_dir``.
    When more than one technology is found for a project, the technology
    label is appended to both file names.

    Args:
        mode: passed to the logger factory and to the project-uuid lookup; the
            value ``'test'`` additionally enables the bundle cut-off check.
        data_dir: directory that holds existing idf files and receives the
            generated SDRF/IDF files.
        project_uuids_filter: forwarded to
            ``hcadam.get_hca_project_uuid_to_import``.
        new_only: when true, previously imported projects are excluded and an
            accession whose ``<accession>.idf.txt*`` already exists in
            ``data_dir`` is skipped.
        sender: forwarded to ``utils.email_report``.
        email_recipients: forwarded to ``utils.email_report``.

    Returns:
        None. A report is emailed only when at least one experiment was
        recorded and both ``sender`` and ``email_recipients`` are truthy.

    Raises:
        utils.HCA2MagetabTranslationError: when the donor_organism and
            cell_suspension json counts in a bundle are incompatible, when no
            technology could be resolved for an SDRF row, or when a factor
            name cannot be extracted from a generated factor header.
    """
    # Retrieve the HCA Json to MAGETAB translation config
    # (process_name is not bound in this function; it is resolved from the enclosing scope)
    config = utils.get_config(process_name)
    idf_config = utils.get_val(config, 'idf')
    sdrf_config = utils.get_val(config, 'sdrf')

    logger = utils.create_logger(data_dir, process_name, mode)
    hca_api_url_root = utils.get_val(config, 'hca_api_url_root')

    # already_imported_project_uuids will be excluded from the import (and their json will not be cached)
    if new_only:
        already_imported_project_uuids = utils.get_previously_imported_projects(
            data_dir)
    else:
        already_imported_project_uuids = []

    project_uuids = hcadam.get_hca_project_uuid_to_import(
        hca_api_url_root, config, mode, project_uuids_filter,
        already_imported_project_uuids, logger)

    # project_uuid2gxa_accession dict forms the worklist of experiments to be imported from HCA
    project_uuid2gxa_accession = {}
    for project_uuid in project_uuids:
        project_uuid2gxa_accession[
            project_uuid] = hcadam.get_gxa_accession_for_project_uuid(
                project_uuid, config)
    project_uuid2gxa_accession = utils.resolve_gxa_accession_for_project_uuid(
        data_dir, project_uuid2gxa_accession)

    # Experiments imported from HCA DCC - for email report
    imported_experiments = []

    # Log experiments to be imported
    logger.info("About to import from HCA DCC the following experiments:")
    for project_uuid in project_uuid2gxa_accession.keys():
        logger.info("%s -> %s" %
                    (project_uuid, project_uuid2gxa_accession[project_uuid]))

    # Metadata retrieve starts here
    for project_uuid in project_uuid2gxa_accession.keys():
        time_start = utils.unix_time_millis(datetime.now())
        accession = project_uuid2gxa_accession.get(project_uuid)
        # NOTE(review): the flattened source does not show which `if` the `else` below
        # belongs to; it is laid out here as the alternative of `if new_only` - confirm.
        if new_only:
            # N.B. if new_only is True, HCA projects for which an idf file in data_dir doesn't exist will be imported
            idf_file_path = '%s/%s.idf.txt*' % (data_dir, accession)
            if glob.glob(idf_file_path):
                logger.info(
                    "Not importing %s as %s already exists (new_only mode: %s)"
                    % (accession, idf_file_path, str(new_only)))
                continue
        else:
            logger.info(
                'About to translate json for HCA study uuid: %s to magetab for gxa accession: %s'
                % (project_uuid, accession))

        # Retrieve all HCA json content for project_uuid
        hca_json_for_project_uuid = hcadam.get_json_for_project_uuid(
            project_uuid)

        # Initialise SDRF-related data structures and flags
        # Set of technologies found in bundles for a given project uuid. The presence of a technology name in that set acts as a flag that sdrf column headers have been collected for that technology.
        technologies_found = set([])

        # List of SDRF column headers (per technology) that will be output in each (technology-specific) sdrf file
        technology2sdrf_column_headers = {}
        # List value corresponding to each technology key in technology2rows dict will be used to accumulate rows of data to be output into the generated (technology-specific) SDRF file
        technology2rows = {}
        # For a given technology key, after all the bundles for a given project have been seen, the value (set) indexes of sdrf columns that are empty for this technology
        # (and therefore will be removed before the sdrf matrix is output into the sdrf file)
        # N.B. Before any bundles are seen, all columns are assumed to be empty until at least one value is encountered for each.
        technology2indexes_of_empty_columns = {}

        # Initialise IDF-related data structures (for data such as protocols - that need to be collected from all the bundles)
        # technology2protocol_type2protocols is used to store all protocol names - to be populated later inside IDF file
        technology2protocol_type2protocols = {}
        # technology2protocol_type2max_protocol_num_per_sample stores maximum number of protocols per technology-protocol_type in any given sample/bundle.
        # This number will dictate how many 'Protocol REF' columns should be output for that protocol_type in sdrf file for that technology
        technology2protocol_type2max_protocol_num_per_sample = {}

        # characteristic_values_in_bundle dict stores sets of (unique) values per characteristic found - in order
        # to later automatically generate the corresponding Factors - for all characteristics for which the values change across the experiment (across all technologies).
        # N.B. A simplifying assumption is made here that in a multi-technology experiment, each technology-specific portion will get the same Factors
        characteristic_values = OrderedDict()

        # Auxiliary counter - used to limit number of HCA bundles processed during testing
        # NOTE(review): bundle_cnt is never incremented anywhere in this function, so the
        # test-mode cut-off further down can only fire when test_max_bundles <= 0 - confirm intent.
        bundle_cnt = 0
        for bundle_url in hca_json_for_project_uuid.keys():
            # We want to warn of missing fields for the first bundle (since each bundle will contain some technology), the test below
            # effectively checks if we're dealing with the first bundle or not
            warn_of_missing_fields = not technologies_found
            hca_json_for_bundle = hca_json_for_project_uuid[bundle_url]
            context = (accession, project_uuid, bundle_url)

            ####################################################
            #### Collect protocols for IDF from bundle_uuid ####
            ####################################################
            protocol_type2protocols_in_bundle = OrderedDict([])
            for protocol_key in utils.get_val(config, 'protocol_types'):
                protocol_type2protocols_in_bundle[protocol_key] = OrderedSet(
                    [])
                for schema_type in list(hca_json_for_bundle.keys()):
                    if re.search(r"" + protocol_key, schema_type):
                        for protocol_json in hca_json_for_bundle[schema_type]:
                            protocol_name = utils.get_hca_value(
                                utils.get_val(config, 'hca_protocol_name_path'),
                                protocol_json, logger, config, False,
                                'Protocol Name', context)
                            if protocol_name != utils.get_val(
                                    config, 'notfound'):
                                protocol_description = utils.get_hca_value(
                                    utils.get_val(
                                        config,
                                        'hca_protocol_description_path'),
                                    protocol_json, logger, config, False,
                                    'Protocol Description', context)
                                protocol_type = utils.get_hca_value(
                                    utils.get_val(config,
                                                  'hca_protocol_type_path'),
                                    protocol_json, logger, config, False,
                                    'Protocol Type', context)
                                protocol_type2protocols_in_bundle[
                                    protocol_key].add(
                                        (protocol_name, protocol_description,
                                         protocol_type))

            ##################
            ###### SDRF ######
            ##################
            technology = None
            # Set of indexes of sdrf columns with non-empty sdrf columns for the current bundle_uuid
            indexes_of_non_empty_sdrf_columns = set([])
            # Output one SDRF row per each donor - sequence file tuple in the bundle
            # Assumptions relating to donors:
            # 1. Every HCA bundle has at least one json object for both: donor_organism and cell_suspension
            # 2. When multiple donor_organism and cell_suspension json objects exist, in the lists of json objects for donor_organism and
            #    cell_suspension respectively, the first JSON in donor_organism list corresponds to the first JSON in the cell_suspension list, and so on.
            #    However, in multi-donor samples with just one cell_suspension json object (e.g. project_uuid: d96c2451-6e22-441f-a3e6-70fd0878bb1b,
            #    bundle_url: https://dss.data.humancellatlas.org/v1/bundles/fb64e4f9-9a24-4a6a-856f-2b7c0d4f309d?version=2019-01-03T153203.452910Z&replica=aws
            #    that single cell_suspension json is assumed to apply to all donor_organism json objects in that bundle.
            donor_json_list = hca_json_for_bundle[utils.get_val(
                config, 'hca_donor_organism')]
            cell_suspension_json_list = hca_json_for_bundle[utils.get_val(
                config, 'hca_cell_suspension')]
            if len(cell_suspension_json_list) != len(donor_json_list) and len(
                    cell_suspension_json_list) != 1:
                err_msg = " Project: %s bundle: %s contain multiple donor_organism and cell_suspension jsons, but their number is not the same" % (
                    project_uuid, bundle_url)
                logger.error(err_msg)
                raise utils.HCA2MagetabTranslationError(err_msg)

            i = 0
            while i < len(donor_json_list):
                donor_json = donor_json_list[i]
                if len(cell_suspension_json_list) > 1:
                    cell_suspension_json = cell_suspension_json_list[i]
                else:
                    cell_suspension_json = cell_suspension_json_list[0]
                i += 1

                for datafile_json in hca_json_for_bundle[utils.get_val(
                        config, 'hca_sequence_file')]:
                    # A fresh header list and row are built for every (donor, sequence file) pair
                    sdrf_column_headers = []
                    current_row = []
                    for line_item in sdrf_config:
                        magetab_label = line_item[0]
                        hca_path = line_item[1]
                        if isinstance(hca_path, list):
                            if not utils.position_valid_for_sdrf_column(
                                    magetab_label, sdrf_column_headers, config):
                                # Skip sdrf columns if the position in which they would be inserted would not be valid given the column just before: sdrf_column_headers[-1]
                                continue
                            elif magetab_label in [
                                    'Characteristics[geographical location]',
                                    'Characteristics[genotype]'
                            ]:
                                # Special handling/parsing - geographical location - multiple json files need checking for field presence
                                value = utils.get_val(config, 'notfound')
                                regex = hca_path[0]
                                for schema_type in list(
                                        hca_json_for_bundle.keys()):
                                    if re.search(r"" + regex, schema_type):
                                        for json_dict in hca_json_for_bundle[
                                                schema_type]:
                                            value = utils.get_hca_value(
                                                hca_path[1:], json_dict, logger,
                                                config, warn_of_missing_fields,
                                                magetab_label, context)
                                            if value != utils.get_val(
                                                    config, 'notfound'):
                                                # Leaves only the inner json_dict loop; the schema_type loop keeps going
                                                break
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label, value,
                                    current_row, characteristic_values, config)
                            elif magetab_label == 'Protocol REF':
                                protocol_type = hca_path[0]
                                # TODO:
                                # Note that before sdrf is output, we will need to split protocol_ids into separate columns, but not before processing all the bundles in the project - we have to wait till the
                                # end to we know how many protocols per technology-protocol type we have. _In theory_ we could have 3 different enrichment protocols for a given technology in a project, e.g.
                                # FACS3, FACS5 and FACS8, and for the current bundle protocol_ids = 'FACS3,FACS8'. Then before outputting sdrf we will have to 'explode' the 'Protocol REF' column corresponding
                                # to the enrichment protocol into 3 (tab-delimited) new columns - and these columns for the current bundle_uuid will have values: 'FACS3\tFACS8\t' and
                                # headers: 'Protocol REF\tProtocol REF\tProtocol REF'
                                protocol_ids = ','.join([
                                    x[0] for x in list(
                                        protocol_type2protocols_in_bundle[
                                            protocol_type])
                                ])
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label,
                                    protocol_ids, current_row,
                                    characteristic_values, config)
                            elif len(hca_path) > 0 and re.search(
                                    r"" + utils.get_val(
                                        config, 'hca_protocol_schema_regex'),
                                    hca_path[0]):
                                protocol_type = hca_path[0]
                                # Special handling/parsing - for a given protocol_type, various protocol-related information needs to be collected from potentially multiple HCA json files
                                values = set([])
                                for schema_type in [
                                        x for x in hca_json_for_bundle.keys()
                                        if x == protocol_type
                                ]:
                                    for json_dict in hca_json_for_bundle[
                                            schema_type]:
                                        value = utils.get_hca_value(
                                            hca_path[1:], json_dict, logger,
                                            config, warn_of_missing_fields,
                                            magetab_label, context)
                                        if value != utils.get_val(
                                                config, 'notfound'):
                                            if magetab_label == 'Comment[library construction]':
                                                # Capture technology for the current bundle
                                                hca_technology = value.lower()
                                                technology = utils.get_gxa_technology(
                                                    hca_technology, config)
                                                value = technology
                                            values.add(str(value))
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label,
                                    ', '.join(values), current_row,
                                    characteristic_values, config)
                            elif magetab_label == 'Comment[HCA bundle url]':
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label,
                                    bundle_url, current_row,
                                    characteristic_values, config)
                            elif magetab_label in [
                                    'Comment[RUN]', 'Comment[FASTQ_URI]',
                                    'Scan Name',
                                    'Comment[technical replicate group]',
                                    'Comment[HCA file uuid]'
                            ]:
                                # Special handling/parsing - Comment[RUN] - datafile_key json file need checking for field presence
                                value = utils.get_hca_value(
                                    hca_path, datafile_json, logger, config,
                                    warn_of_missing_fields, magetab_label,
                                    context)
                                if magetab_label == 'Comment[RUN]':
                                    # NB. We're stripping e.g. _2.fastq.gz from the end - to retain just the core file name
                                    # Tested on the following types of file names:
                                    # "FCA7167226_I1.fastq.gz", "MantonBM7_HiSeq_4_S19_L005_R2_001.fastq.gz", "E18_20160930_Neurons_Sample_57_S054_L005_I1_010.fastq.gz", "FCA7167226.fastq.gz"
                                    value = re.sub(
                                        r"(\_\w\d|\_\w\d\_\d+|\_\d)*\.f\w+\.gz",
                                        "", value)
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label, value,
                                    current_row, characteristic_values, config)
                            else:
                                schema_type = hca_path[0]
                                if schema_type in hca_json_for_bundle:
                                    if schema_type == utils.get_val(
                                            config, 'hca_donor_organism'):
                                        json_dict = donor_json
                                    elif schema_type == utils.get_val(
                                            config, 'hca_cell_suspension'):
                                        json_dict = cell_suspension_json
                                    else:
                                        # Retrieving the first element below follows the assumption of one single json object in schema_type in a bundle
                                        # (all the special cases were handled above)
                                        json_dict = hca_json_for_bundle[
                                            schema_type][0]
                                    value = utils.get_hca_value(
                                        hca_path[1:], json_dict, logger, config,
                                        warn_of_missing_fields, magetab_label,
                                        context)
                                else:
                                    value = utils.get_val(config, 'notfound')
                                if magetab_label in \
                                        ['Characteristics[organism]', 'Characteristics[disease]', 'Characteristics[cell subtype]', 'Characteristics[ethnic group]','Characteristics[strain]'] \
                                        and value != utils.get_val(config, 'notfound'):
                                    # Special handling/parsing - organism, disease - could be multiple according to HCA schema
                                    utils.add_to_row(
                                        indexes_of_non_empty_sdrf_columns,
                                        sdrf_column_headers, magetab_label,
                                        ','.join([x['text'] for x in value]),
                                        current_row, characteristic_values,
                                        config)
                                else:
                                    # magetab_label is not a list or a special case
                                    utils.add_to_row(
                                        indexes_of_non_empty_sdrf_columns,
                                        sdrf_column_headers, magetab_label,
                                        str(value), current_row,
                                        characteristic_values, config)
                        else:
                            # hca_path is not a list - add to the row as is
                            utils.add_to_row(indexes_of_non_empty_sdrf_columns,
                                             sdrf_column_headers, magetab_label,
                                             hca_path, current_row,
                                             characteristic_values, config)
                    # At least one bundle has been seen - the SDRF columns have now been determined
                    if technology:
                        # Append current_row to the list of rows in the SDRF file being generated
                        if technology not in technology2rows.keys():
                            technology2rows[technology] = []
                        technology2rows[technology].append(current_row)
                        # The presence of a technology name in that set acts as a flag that sdrf column headers have been collected for that technology.
                        if technology not in technologies_found:
                            technology2sdrf_column_headers[
                                technology] = sdrf_column_headers
                            # To start off with assume all columns are empty
                            technology2indexes_of_empty_columns[
                                technology] = range(len(sdrf_config))
                            # Initialise technology2protocol_type2protocols with new technology
                            technology2protocol_type2protocols[
                                technology] = OrderedDict()
                            technologies_found.add(technology)
                        # Store (without duplicates) for technology the protocols found for bundle_uuid (i.e. those in protocol_type2protocols_in_bundle)
                        for protocol_type in protocol_type2protocols_in_bundle.keys(
                        ):
                            num_protocols_in_bundle = len(
                                protocol_type2protocols_in_bundle[
                                    protocol_type])
                            if num_protocols_in_bundle > 0:
                                if technology not in technology2protocol_type2max_protocol_num_per_sample.keys(
                                ):
                                    technology2protocol_type2max_protocol_num_per_sample[
                                        technology] = OrderedDict({
                                            protocol_type:
                                            num_protocols_in_bundle
                                        })
                                elif protocol_type not in technology2protocol_type2max_protocol_num_per_sample[
                                        technology].keys():
                                    technology2protocol_type2max_protocol_num_per_sample[
                                        technology][
                                            protocol_type] = num_protocols_in_bundle
                                else:
                                    technology2protocol_type2max_protocol_num_per_sample[
                                        technology][protocol_type] = max(
                                            num_protocols_in_bundle,
                                            technology2protocol_type2max_protocol_num_per_sample[
                                                technology][protocol_type])
                                if protocol_type not in technology2protocol_type2protocols[
                                        technology].keys():
                                    technology2protocol_type2protocols[
                                        technology][
                                            protocol_type] = OrderedSet([])
                                # Merge set: protocol_type2protocols_in_bundle[protocol_type] into set already in technology2protocol_type2protocols[technology][protocol_type]
                                technology2protocol_type2protocols[technology][
                                    protocol_type] |= protocol_type2protocols_in_bundle[
                                        protocol_type]
                    else:
                        # NOTE(review): hca_technology is only assigned in the 'Comment[library construction]'
                        # branch above; if that branch never ran, building this message raises
                        # UnboundLocalError (or reports a stale value from an earlier row) - confirm.
                        err_msg = "Failed to retrieve valid technology from value: \"%s\" in bundle: %s" % (
                            hca_technology, bundle_url)
                        logger.error(err_msg)
                        raise utils.HCA2MagetabTranslationError(err_msg)

            # Now remove from technology2indexes_of_empty_columns[technology] all column indexes we found non-empty values for, for the current bundle_uuid
            technology2indexes_of_empty_columns[technology] = [
                x for x in technology2indexes_of_empty_columns[technology]
                if x not in indexes_of_non_empty_sdrf_columns
            ]

            # Number of bundles processed per study - test mode cut-off
            if mode == 'test' and bundle_cnt >= utils.get_val(
                    config, 'test_max_bundles'):
                break

        # Now work out which Characteristics should be auto-generated as Factors also
        technology2factors = {}
        # Assumption - in experiments imported from HCA DCC, only one column for a unique characteristic name will be output in the resulting SDRF file
        technology2factor2characteristic_colnum = {}
        for technology in technologies_found:
            technology2factors[technology] = []
            technology2factor2characteristic_colnum[technology] = {}
            for characteristic in characteristic_values:
                # A characteristic becomes a factor only if it is an SDRF column for this technology and more than one value was recorded for it
                if characteristic in technology2sdrf_column_headers[
                        technology] and len(
                            characteristic_values[characteristic]) > 1:
                    factor = re.sub("Characteristics", "FactorValue",
                                    characteristic)
                    technology2factors[technology].append(factor)
                    technology2sdrf_column_headers[technology].append(factor)
                    # Store index (in each sdrf row) of the characteristic corresponding factor, so that we know where to get the value from
                    # when populating factor values in sdrf later
                    technology2factor2characteristic_colnum[technology][
                        factor] = technology2sdrf_column_headers[
                            technology].index(characteristic)
            # Add Factor for single cell identifier (smart-seq2 experiments only)
            smart_regex = re.compile('smart-.*$')
            if smart_regex.match(technology):
                factor = 'FactorValue[single cell identifier]'
                technology2sdrf_column_headers[technology].append(factor)
                technology2factors[technology].append(factor)
                technology2factor2characteristic_colnum[technology][
                    factor] = technology2sdrf_column_headers[technology].index(
                        'Source Name')

        # For each technology, write out the generated SDRF file.
        # N.B. IF the HCA project is multi-technology, append the technology label to the end of the sdrf file name
        multi_technology_hca_project = len(technologies_found) > 1
        for technology in technologies_found:
            sdrf_file_name = "%s.sdrf.txt" % accession
            if multi_technology_hca_project:
                sdrf_file_name = "%s.%s" % (sdrf_file_name, technology)
            with open(os.path.join(data_dir, sdrf_file_name), 'wb') as f:
                # NOTE(review): the standard-library csv.writer does not accept an `encoding`
                # keyword; presumably `csv` is bound to a compatible third-party module
                # (e.g. unicodecsv) that writes bytes - confirm against the file's imports.
                csvwriter = csv.writer(f,
                                       delimiter='\t',
                                       encoding='utf-8',
                                       escapechar='\\',
                                       quotechar='',
                                       lineterminator='\n',
                                       quoting=csv.QUOTE_NONE)

                # Remove from technology2sdrf_column_headers[technology] headers of columns that are empty for this technology
                utils.remove_empty_columns(
                    technology2sdrf_column_headers[technology],
                    technology2indexes_of_empty_columns[technology])

                # Expand protocol column headers to account for multiple protocols per protocol_type, if applicable
                expanded_headers = technology2sdrf_column_headers[
                    technology].copy()
                utils.expand_protocol_columns(
                    None, expanded_headers,
                    technology2protocol_type2max_protocol_num_per_sample[
                        technology], logger)

                # Write out sdrf header line
                csvwriter.writerow(expanded_headers)

                for row in technology2rows[technology]:
                    # Append to row values for all the auto-generated factors
                    for factor in technology2factors[technology]:
                        row.append(row[technology2factor2characteristic_colnum[
                            technology][factor]])
                    # Remove from row elements in positions corresponding to columns that are empty for this technology
                    utils.remove_empty_columns(
                        row, technology2indexes_of_empty_columns[technology])
                    # Expand protocol values into multiple columns to account for multiple protocols per protocol_type, if applicable
                    utils.expand_protocol_columns(
                        row, technology2sdrf_column_headers[technology],
                        technology2protocol_type2max_protocol_num_per_sample[
                            technology], logger)
                    # Write out sdrf data line
                    csvwriter.writerow(row)

        #################
        ###### IDF ######
        #################
        # NOTE(review): hca_json_for_bundle and context used below are whatever the bundle loop
        # above left behind, i.e. the last bundle processed for this project - confirm this is intended.
        for technology in technologies_found:
            idf_file_name = "%s.idf.txt" % accession
            if multi_technology_hca_project:
                idf_file_name = "%s.%s" % (idf_file_name, technology)
            with open(os.path.join(data_dir, idf_file_name), 'wb') as f:
                csvwriter = csv.writer(f,
                                       delimiter='\t',
                                       encoding='utf-8',
                                       escapechar='\\',
                                       quotechar='',
                                       lineterminator='\n',
                                       quoting=csv.QUOTE_NONE)
                for line_item in idf_config:
                    magetab_label = line_item[0]
                    hca_path = line_item[1]
                    if isinstance(hca_path, list):
                        if magetab_label in [
                                'Term Source Name', 'Term Source File'
                        ]:
                            # Special handling/parsing - hca_path is a list of literal values, rather than locations in HCA json files
                            csvwriter.writerow([magetab_label] + hca_path)
                            continue
                        if hca_path:
                            # Note the assumption that only one project_json object exists per bundle
                            # (c.f. hca_schemas_with_one_json_per_bundle_expected in hca2mtab.yml)
                            json_dict = hca_json_for_bundle[hca_path[0]][0]
                            value = utils.get_hca_value(hca_path[1:], json_dict,
                                                        logger, config, True,
                                                        magetab_label, context)
                        # When hca_path is an empty list, json_dict and value keep whatever an earlier line item assigned
                        if magetab_label in [
                                'Public Release Date'
                        ] and value != utils.get_val(config, 'notfound'):
                            # Special handling/parsing - Public Release date, Comment[HCALastUpdateDate], Comment[HCAReleaseDate]
                            m = re.search(r'^(\d{4}\-\d{2}\-\d{2}).*$', value)
                            if m:
                                value = m.group(1)
                            else:
                                logger.error(
                                    "Failed to parse date out of: %s" % value)
                                value = ''
                            csvwriter.writerow([magetab_label, value])
                        elif magetab_label in [
                                'Comment[ExpressionAtlasAccession]', 'SDRF File'
                        ]:
                            # Special handling/parsing - use previously derived accession
                            value = accession
                            if magetab_label == 'SDRF File':
                                # SDRF file name - derive from experiment accession
                                value = re.sub(r'\.idf\.', '.sdrf.',
                                               idf_file_name)
                            candidate_acc_regex_obj = re.compile('E-CAND-\d+')
                            # The accession line is written only when the accession does not match the E-CAND-<digits> pattern
                            if magetab_label == 'SDRF File' or (
                                    magetab_label
                                    == 'Comment[ExpressionAtlasAccession]'
                                    and not candidate_acc_regex_obj.match(
                                        accession)):
                                csvwriter.writerow([magetab_label, value])
                        elif magetab_label in ['Comment[HCALastUpdateDate]']:
                            csvwriter.writerow([
                                magetab_label,
                                datetime.now().strftime("%Y-%m-%d")
                            ])
                        elif magetab_label == 'Comment[SecondaryAccession]':
                            # Special handling - secondary accessions
                            secondary_accessions = OrderedSet([])
                            for label in utils.get_val(
                                    config,
                                    'hca_old_secondary_accessions_labels'):
                                # NOTE(review): hca_project_json is only bound inside this loop but is
                                # also read by the next loop; an empty label list here would leave it unbound.
                                hca_project_json = hca_json_for_bundle[
                                    utils.get_val(config, 'hca_project')]
                                if label in hca_project_json:
                                    secondary_accessions.add(
                                        hca_project_json[label])
                            # For the reason for the loop below see a comment near hca_old_secondary_accessions_labels in hca2mtab.yml
                            for label in utils.get_val(
                                    config,
                                    'hca_new_secondary_accessions_labels'):
                                if label in hca_project_json:
                                    for secondary_accession in hca_project_json[
                                            label]:
                                        secondary_accessions.add(
                                            secondary_accession)
                            # Now append the HCA study uuid
                            secondary_accessions.add(project_uuid)
                            if len(secondary_accessions) > 0:
                                csvwriter.writerow(
                                    ['Comment[SecondaryAccession]'] +
                                    list(secondary_accessions))
                        elif magetab_label in [
                                'Experimental Factor Name',
                                'Experimental Factor Type'
                        ]:
                            # Special handling - populate factors that where auto-generated in SDRF above
                            idf_line = [magetab_label]
                            for factor in technology2factors[technology]:
                                # The factor name is the text between the square brackets of the header
                                m = re.search(r'\[(.*)\]', factor)
                                if m:
                                    idf_line.append(m.group(1))
                                else:
                                    err_msg = "Failed to extract Factor name from %s" % factor
                                    logger.error(err_msg)
                                    raise utils.HCA2MagetabTranslationError(
                                        err_msg)
                            csvwriter.writerow(idf_line)
                        elif isinstance(magetab_label, list):
                            if re.search('Person Last Name', magetab_label[0]):
                                # Special handling/parsing - Contributors
                                contact_rows = OrderedDict()
                                for row_label in magetab_label:
                                    contact_rows[row_label] = []
                                for contact in utils.get_hca_value(
                                        hca_path[1:], json_dict, logger, config,
                                        True, magetab_label, context):
                                    # contact_name is split on commas: first piece -> last name, last piece -> first name,
                                    # and the middle piece (only when there are exactly three) -> mid initials
                                    contact_name_arr = contact[
                                        'contact_name'].split(',')
                                    contact_rows['Person Last Name'].append(
                                        contact_name_arr[0])
                                    contact_rows['Person First Name'].append(
                                        contact_name_arr[-1].lstrip())
                                    if len(contact_name_arr) == 3:
                                        contact_rows[
                                            'Person Mid Initials'].append(
                                                contact_name_arr[1])
                                for contact in utils.get_hca_value(
                                        hca_path[1:], json_dict, logger, config,
                                        True, magetab_label, context):
                                    email = utils.get_hca_value(
                                        ['email'], contact, logger, config,
                                        True, magetab_label, context)
                                    contact_rows['Person Email'].append(
                                        email if email != utils.
                                        get_val(config, 'notfound') else '')
                                    contact_rows['Person Affiliation'].append(
                                        contact['institution'])
                                for contact in utils.get_hca_value(
                                        hca_path[1:], json_dict, logger, config,
                                        True, magetab_label, context):
                                    address = utils.get_hca_value(
                                        ['address'], contact, logger, config,
                                        True, magetab_label, context)
                                    contact_rows['Person Address'].append(
                                        address if address != utils.
                                        get_val(config, 'notfound') else '')
                                for key in list(contact_rows.keys()):
                                    csvwriter.writerow([key] +
                                                       contact_rows[key])
                            elif 'Protocol Name' == magetab_label[0]:
                                # Special handling/parsing - Protocols
                                protocol_rows = OrderedDict()
                                for row_label in magetab_label:
                                    protocol_rows[row_label] = []
                                for protocol_type in technology2protocol_type2protocols[
                                        technology].keys():
                                    # Traverse through protocol tuples in alphabetic order - by protocol name
                                    for protocol_tuple in sorted(
                                            technology2protocol_type2protocols[
                                                technology][protocol_type],
                                            key=lambda x: x[0]):
                                        protocol_rows['Protocol Name'].append(
                                            protocol_tuple[0])
                                        protocol_rows[
                                            'Protocol Description'].append(
                                                protocol_tuple[1]
                                                if protocol_tuple[1] != utils.
                                                get_val(config, 'notfound'
                                                        ) else '')
                                        protocol_rows['Protocol Type'].append(
                                            protocol_tuple[2]
                                            if protocol_tuple[2] !=
                                            utils.get_val(
                                                config, 'notfound') else '')
                                for key in list(protocol_rows.keys()):
                                    csvwriter.writerow([key] +
                                                       protocol_rows[key])
                            elif re.search('Publication Title',
                                           magetab_label[0]):
                                if utils.get_hca_value(
                                        hca_path[1:], json_dict, logger, config,
                                        True, magetab_label[0],
                                        context) == utils.get_val(
                                            config, 'notfound'):
                                    # Skip the publications-related idf config
                                    continue
                                # Special handling/parsing - Publications
                                publication_rows = OrderedDict()
                                for row_label in 'Publication Title', 'Publication Author List', 'PubMed ID', 'Publication DOI':
                                    publication_rows[row_label] = []
                                for publication in utils.get_hca_value(
                                        hca_path[1:], json_dict, logger, config,
                                        True, magetab_label, context):
                                    publication_rows[
                                        'Publication Title'].append(
                                            utils.get_hca_value(
                                                utils.get_val(
                                                    config,
                                                    'hca_publication_title_path'
                                                ), publication, logger, config,
                                                True, magetab_label, context))
                                    publication_rows[
                                        'Publication Author List'].append(
                                            ', '.join(
                                                utils.get_hca_value(
                                                    utils.get_val(
                                                        config,
                                                        'hca_publication_authors_path'
                                                    ), publication, logger,
                                                    config, True,
                                                    magetab_label, context)))
                                    pubmed_id = utils.get_hca_value(
                                        utils.get_val(
                                            config,
                                            'hca_publication_pmid_path'),
                                        publication, logger, config, True,
                                        magetab_label, context)
                                    publication_rows['PubMed ID'].append(
                                        str(pubmed_id
                                            ) if str(pubmed_id) != utils.
                                        get_val(config, 'notfound') else '')
                                    publication_doi = utils.get_hca_value(
                                        utils.get_val(
                                            config, 'hca_publication_doi_path'),
                                        publication, logger, config, True,
                                        magetab_label, context)
                                    publication_rows['Publication DOI'].append(
                                        publication_doi
                                        if publication_doi != utils.
                                        get_val(config, 'notfound') else '')
                                for key in list(publication_rows.keys()):
                                    csvwriter.writerow([key] +
                                                       publication_rows[key])
                        else:
                            # magetab_label is not a list or a special case
                            csvwriter.writerow([magetab_label, value])
                            if magetab_label == 'Investigation Title':
                                # Remember the experiment for the email report sent at the end
                                imported_experiments.append(
                                    "%s (%s - %d bundles): %s" %
                                    (accession, technology,
                                     len(hca_json_for_project_uuid.keys()),
                                     value))
                    else:
                        # hca_path is not a list
                        csvwriter.writerow(line_item)

        time_end = utils.unix_time_millis(datetime.now())
        # The millisecond difference is converted to minutes for the log line
        duration = (time_end - time_start) / 1000 / 60
        logger.info(
            "Processing HCA study uuid: %s for gxa accession: %s took %d mins" %
            (project_uuid, accession, duration))

    if imported_experiments and sender and email_recipients:
        utils.email_report("New experiments imported from HCA DCC",
                           '\n'.join(imported_experiments), sender,
                           email_recipients)
def parse_msf_module_local(self, target_file):
    """Parse a local module source file and store its metadata.

    The file is read as text, the first ``initialize ... end`` section is
    isolated, and each metadata field is pulled out of that section with a
    regular expression. The result is wrapped in an ``MsfRecord`` and handed
    to ``self.insert_record``.

    Args:
        target_file: Path of the module source file. The part of the path
            starting at the first occurrence of ``"modules"`` becomes the
            module name.

    Raises:
        IndexError: If the module name has no second ``PATH_SPLIT``-separated
            component.
        OSError: If ``target_file`` cannot be opened or read.
    """
    regex_pattern = {
        'module_info': r"initialize[\s\S]*?end\n",
        'module_title': r"['|\"]Name['|\"][ |\t|\S]+['|\"|\)]",
        'module_describe': r"['|\"]Description['|\"][\s\S]*?['|\"|\)],\n|['|\"]Description['|\"][^\}]+},\n",
        'module_authors': r"['|\"]Author['|\"][^\]]+\],\n|['|\"]Author['|\"][ |\t|\S]+['|\"|\)|\]],\n",
        'module_cve': r"['|\"]CVE['|\"],\s['|\"]\d{4}-\d+['|\"]",
        'module_edb': r"['|\"]EDB['|\"],\s['|\"]\d+['|\"]",
        'module_cwe': r"['|\"]CWE['|\"],\s['|\"]\d+['|\"]",
        'module_bid': r"['|\"]BID['|\"],\s['|\"]\d+['|\"]",
        'module_zdi': r"['|\"]ZDI['|\"],\s['|\"]\d{2}-\d+['|\"]",
        'module_msb': r"['|\"]MSB['|\"],\s['|\"]MS\d{2}-\d+['|\"]",
        'module_osvdb': r"['|\"]OSVDB['|\"],\s['|\"]\d+['|\"]",
        'module_wpvdb': r"['|\"]WPVDB['|\"],\s['|\"]\d+['|\"]",
        'module_uscert': r"['|\"]US-CERT-VU['|\"],\s['|\"]\S+['|\"]",
        'module_packet': r"['|\"]PACKETSTORM['|\"],\s['|\"]\S+['|\"]",
        'module_ref_url': r"['|\"]URL['|\"],\s['|\"]\S+['|\"]",
        'module_platforms_fmt': r"['|\"]Platform['|\"][ |\t]+=>[ |\t]%+[^\}]+},\n",
        'module_platforms': r"['|\"]Platform['|\"][ |\t|\S]+['|\"|\)|\]],\n|['|\"]Platform['|\"][^\}]+},\n",
        'module_disclosure_date': r"['|\"]DisclosureDate['|\"][ |\t|\S]+['|\"],*\n",
    }

    # Read everything up front inside a context manager so the handle is
    # released even when one of the parsing steps below raises.
    with open(target_file, "r") as file_obj:
        source_code = file_obj.read()

    update_info_code = get_val(
        re.findall(regex_pattern['module_info'], source_code))

    def _extract(key):
        # Run one named pattern over the initialize section and collapse the
        # matches with the file's get_val helper.
        return get_val(re.findall(regex_pattern[key], update_info_code))

    module_name_start_pos = target_file.find("modules")
    module_name = target_file[module_name_start_pos:]
    module_class = module_name.split(PATH_SPLIT)[1]
    # NOTE(review): module_name itself starts with "modules", so the URL
    # carries that path segment twice -- confirm this is intended.
    module_url = f"https://www.rapid7.com/db/modules/{module_name}".replace(
        "exploits", "exploit").replace(".rb", "")

    module_title = self.optimize_title(_extract('module_title'))
    # Collapse every run of whitespace in the description to a single space.
    module_describe = ' '.join(
        self.optimize_describe(_extract('module_describe')).split())

    # TODO: Efficient author's parsing method. The 'module_authors' pattern
    # above is kept for it; authors are not extracted yet.

    module_cve = self.optimize_ref_id(_extract('module_cve'))
    module_edb = self.optimize_ref_id(_extract('module_edb'))

    # The remaining reference ids are merged into one list, in this order,
    # with the reference URLs appended last.
    reference_keys = (
        'module_cwe', 'module_bid', 'module_zdi', 'module_msb',
        'module_osvdb', 'module_wpvdb', 'module_uscert', 'module_packet',
    )
    module_ref_list = []
    for key in reference_keys:
        module_ref_list = module_ref_list + self.optimize_ref_id(
            _extract(key)).split(",")
    module_ref_list = module_ref_list + self.optimize_ref_url(
        _extract('module_ref_url'))
    # Splitting an empty result leaves '' entries behind; drop them.
    module_ref_list = [ref for ref in module_ref_list if ref != '']
    module_references = get_val(module_ref_list)

    # Try the "Platform => %..." form first, then the plain form; fall back
    # to an empty string when neither yields a usable match.
    module_platforms = ""
    for key in ('module_platforms_fmt', 'module_platforms'):
        try:
            module_platforms = self.optimize_platforms(
                re.findall(regex_pattern[key], update_info_code)[0])
        except IndexError:
            continue
        break

    # Remote ports are looked up in the whole file, not just the
    # initialize section.
    module_remote_ports = self.optimize_remote_port(source_code)
    module_disclosure_date = self.optimize_disclosure_date(
        _extract('module_disclosure_date'))
    module_collect_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime())

    msf_record = MsfRecord(module_name=module_name,
                           module_class=module_class,
                           module_title=module_title,
                           module_url=module_url,
                           module_describe=module_describe,
                           module_cve=module_cve,
                           module_edb=module_edb,
                           module_references=module_references,
                           module_platforms=module_platforms,
                           module_remote_ports=module_remote_ports,
                           module_disclosure_date=module_disclosure_date,
                           module_collect_date=module_collect_date)
    self.insert_record(msf_record)
def parse_msf_module(self, item):
    """Fetch one module page from rapid7.com, scrape it and store the result.

    Args:
        item: Site-relative path of the module page; it is appended to
            ``https://www.rapid7.com`` to build the request URL.

    When the response status is not 200, a name-only ``MsfRecord`` is added
    through ``self.msf_dao.add`` and the method returns without scraping.
    Otherwise the scraped fields are wrapped in an ``MsfRecord`` and passed
    to ``self.insert_record``.
    """
    url = "https://www.rapid7.com{}".format(item)
    module_item = self.request(url)
    if module_item.status_code != 200:
        # presumably item[11:] strips a fixed "/db/modules" style prefix --
        # TODO confirm against the caller.
        msf_record = MsfRecord(module_name=item[11:])
        self.msf_dao.add(msf_record)
        # The failure is already recorded; do not scrape the error page.
        return

    element_xpath = {
        'module_title': '//div[@class="vulndb__detail-main"]/h3/text()',
        'module_url': '/html/head/link[@rel="canonical"]/@href',
        'module_devlink': '//section[contains(@class,"vulndb__solution")]/ul/li[1]/a/@href',
        'module_describe': '//div[contains(@class,"vulndb__detail-content")]/p/text()',
        'module_authors': '//div[contains(@class,"vulndb__detail-content")]/ul/li/text()',
        'module_references': '//section[contains(@class,"vulndb__references")]/ul/li//text()',
        'module_platforms': '//div[contains(@class,"vulndb__detail-content")]/p[2]/text()',
        'module_architectures': '//div[contains(@class,"vulndb__detail-content")]/p[3]/text()',
    }

    def _select(key):
        # Evaluate one named XPath against the fetched page.
        return module_item.html.xpath(element_xpath[key])

    module_url = get_val(_select("module_url"))
    code_link = get_val(_select("module_devlink"))
    # presumably drops a fixed-length repository URL prefix from the source
    # link, leaving the module path -- TODO confirm the offset of 60.
    module_name = code_link[60:]
    module_title = get_val(_select("module_title"))

    # Only the first description paragraph is used, with whitespace runs
    # collapsed; a page without one yields an empty description instead of
    # raising IndexError.
    describe_nodes = _select("module_describe")
    module_describe = ' '.join(
        describe_nodes[0].split()) if describe_nodes else ""

    module_authors = get_val(_select("module_authors"))
    module_references = get_val(_select("module_references"))

    module_cve = ""
    module_edb = ""
    # Extracting CVEs&EDBs from reference information
    if module_references is not None:
        pattern = r"CVE-\d{4}-\d+|EDB-\d+"
        exclusion_pattern = r"CVE-\d{4}-\d+,?|EDB-\d+,?"
        module_cve_edb_list = re.findall(pattern, module_references)
        # The ids get their own fields, so strip them (and a trailing comma)
        # from the free-text references.
        module_references = re.sub(exclusion_pattern, "", module_references)
        cve_list = [ref_id for ref_id in module_cve_edb_list
                    if "CVE" in ref_id]
        edb_list = [ref_id for ref_id in module_cve_edb_list
                    if "CVE" not in ref_id and "EDB" in ref_id]
        # Joining an empty list gives "", matching the defaults above.
        module_cve = ','.join(cve_list)
        module_edb = ','.join(edb_list)

    module_platforms = get_val(_select("module_platforms"))
    module_architectures = get_val(_select("module_architectures"))

    modified_date = MsfCollector.get_modified_date(module_name)
    module_update_date = parser.parse(modified_date).strftime(
        "%Y-%m-%d %H:%M:%S")
    module_collect_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime())

    msf_record = MsfRecord(module_name=module_name,
                           module_title=module_title,
                           module_url=module_url,
                           module_describe=module_describe,
                           module_authors=module_authors,
                           module_cve=module_cve,
                           module_edb=module_edb,
                           module_references=module_references,
                           module_platforms=module_platforms,
                           module_architectures=module_architectures,
                           module_update_date=module_update_date,
                           module_collect_date=module_collect_date)
    self.insert_record(msf_record)