def _find_lexically_based_slink(self, sentence, event_expr):
    """Try to find lexically based Slinks for an instance of EventExpression
    using forward, backward and reporting FSA patterns. No return value; if a
    Slink is found, it will be created by the chunk that embeds the Slink
    triggering event.

    Arguments:
       sentence - the Sentence containing the event
       event_expr - an EventExpression"""
    evNode = sentence[event_expr.locInSent]
    if evNode is None:
        logger.error("No node found at locInSent=%s" % event_expr.locInSent)
        return
    slink_created = False
    logger.debug("Sentence element class: %s" % evNode.__class__.__name__)
    # Try the three FSA pattern groups in order, stopping as soon as one
    # of them creates a Slink.
    forwardFSAs = event_expr.slinkingContexts('forward')
    if forwardFSAs:
        logger.debug("Applying FORWARD slink FSAs")
        slink_created = evNode.find_forward_slink(forwardFSAs)
        logger.debug("forward slink created = %s" % slink_created)
    if not slink_created:
        backwardFSAs = event_expr.slinkingContexts('backwards')
        if backwardFSAs:
            logger.debug("Applying BACKWARD slink FSAs")
            slink_created = evNode.find_backward_slink(backwardFSAs)
            logger.debug("backward slink created = %s" % slink_created)
    if not slink_created:
        reportingFSAs = event_expr.slinkingContexts('reporting')
        if reportingFSAs:
            logger.debug("Applying REPORTING slink FSAs")
            slink_created = evNode.find_reporting_slink(reportingFSAs)
            logger.debug("reporting slink created = %s" % slink_created)
def process_fragments(self):
    """Set fragment names, create the vectors for each fragment, run the
    classifier and add links from the classifier to the fragments."""
    os.chdir(self.DIR_LINK_MERGER + os.sep + 'sputlink')
    # The perl executable comes from the processing options; two earlier
    # hard-coded assignments were dead code (immediately overwritten) and
    # have been removed.
    perl = self.tarsqi_instance.getopt_perl()
    for fragment in self.fragments:
        # set fragment names
        base = fragment[0]
        in_fragment = os.path.join(self.DIR_DATA,
                                   base + '.' + self.CREATION_EXTENSION)
        tmp_fragment = os.path.join(self.DIR_DATA,
                                    base + '.' + self.TMP_EXTENSION)
        out_fragment = os.path.join(self.DIR_DATA,
                                    base + '.' + self.RETRIEVAL_EXTENSION)
        # process them
        command = "%s merge.pl %s %s" % (perl, in_fragment, tmp_fragment)
        (i, o, e) = os.popen3(command)
        for line in e:
            # merge.pl reports warnings on stderr; anything else there is
            # treated as an error
            if line.lower().startswith('warn'):
                logger.warn('MERGING: ' + line)
            else:
                logger.error('MERGING: ' + line)
        for line in o:
            logger.debug('MERGING: ' + line)
        self._add_tlinks_to_fragment(in_fragment, tmp_fragment, out_fragment)
    os.chdir(TTK_ROOT)
def addInPreviousSublist(self, list, element):
    """Append element to the sublist just before the current counter
    position, starting a brand-new sublist when both the list and the
    counter are still at zero. Logs an error when the list is too short
    for the counter."""
    previous = self.counter - 1
    if self.counter == 0 and not list:
        list.append([element])
    elif len(list) >= previous:
        list[previous].append(element)
    else:
        logger.error("ERROR: list should be longer")
def run(self):
    """Keep pushing data to the server until self.stop_run is set,
    logging any RuntimeError from a push and sleeping between attempts."""
    while not self.stop_run:
        try:
            self.push_to_server()
        except RuntimeError as err:
            logger.error(str(err))
        time.sleep(self.time_interval)
def process_item(self, item, spider):
    """Scrapy pipeline hook: persist the scraped article URL and its
    relation to the subscription it came from. Database writes only happen
    when the spider carries a truthy 'db_write' attribute."""
    db_write = getattr(spider, 'db_write', None)
    if db_write:
        try:
            # peewee get_or_create returns (row, created-flag)
            article, url_inserted = Article.get_or_create(
                article_url=item['article_url'])
            if url_inserted:
                # Brand-new article: link it to the subscription.
                subs_article, relation_created = SubscriptionArticle.get_or_create(
                    subscription=item['subscription_id'], article=article.id)
                logger.info(
                    'article_url [ID:%s] is now associated with index_url [ID:%s]',
                    subs_article.article, subs_article.subscription)
            else:
                # Existing article: the relation may or may not exist yet.
                # NOTE(review): this call is identical to the branch above;
                # only the logging differs. Candidates for merging.
                subs_article, relation_created = SubscriptionArticle.get_or_create(
                    subscription=item['subscription_id'], article=article.id)
                if not relation_created:
                    logger.info(
                        'relation between article_url [ID:%s] and index_url [ID:%s] has been ignored',
                        subs_article.article, subs_article.subscription)
                else:
                    logger.info(
                        'article_url [ID:%s] has created a new relationship with index_url [ID:%s]',
                        subs_article.article, subs_article.subscription)
        except (RuntimeError, KeyError, NameError) as e:
            logger.error('%s happened when handling %s', str(e),
                         item['article_url'])
            raise RuntimeError('Error received from Scrapy Pipelines')
def _feature_extract(problem_name, filelist, path, problematic_filename,
                     silent, with_existed_merged_file=False):
    """Clean quotes from every file in filelist and run feature extraction
    on the cleaned copies. In silent mode, per-file extraction failures are
    collected instead of raised; the failing filenames are logged and, when
    problematic_filename is given, written to that file under LOG_DATA_PATH.

    Arguments:
       problem_name - passed through to _feature_extraction
       filelist - list of input file paths
       path - output path passed through to _feature_extraction
       problematic_filename - name of the failure log file, or None
       silent - when True, trap per-file extraction errors
       with_existed_merged_file - passed through to _feature_extraction"""
    problematic_files = []
    for file in filelist:
        cleaner.quote_cleaner(file, file + QUOTE_CLEAN_SUFFIX)
        if silent:
            try:
                _feature_extraction(problem_name, file + QUOTE_CLEAN_SUFFIX,
                                    path, with_existed_merged_file)
            except Exception:
                problematic_files.append(file + QUOTE_CLEAN_SUFFIX)
        else:
            _feature_extraction(problem_name, file + QUOTE_CLEAN_SUFFIX,
                                path, with_existed_merged_file)
    # 'is not None' instead of '!= None' (identity check for None)
    if problematic_filename is not None:
        problematic_filename = os.path.join(LOG_DATA_PATH, problematic_filename)
        if problematic_filename.endswith(QUOTE_CLEAN_SUFFIX):
            problematic_filename = problematic_filename[:-len(QUOTE_CLEAN_SUFFIX)]
        # Context manager so the log file is closed even if a write fails
        # (the old code leaked the handle on error).
        with open(problematic_filename, 'w') as f:
            for file in problematic_files:
                logger.error(
                    '========================PROBLEM===========================')
                logger.error(file)
                f.write(file)
                f.write('\n')
def _get_default(self, doctype, feature, fallback):
    """Look up the default value of a feature for a document type; return
    the fallback (and log an error) when no value is available."""
    defaults_for_type = DSI_DEFAULTS.get(doctype, {})
    if feature in defaults_for_type:
        return defaults_for_type[feature]
    logger.error("No default %s for document type %s" % (feature, doctype))
    return fallback
def __init__(self, pipeline=None, dom_node=None):
    """Initialize from a pipeline or a DOM node; log an error when
    neither source was given."""
    if pipeline is not None:
        self._initialize_from_pipeline(pipeline)
        return
    if dom_node is not None:
        self._initialize_from_dom_node(dom_node)
        return
    logger.error("ProcessingStep cannot be initialized")
def _write_output(self):
    """Write the TarsqiDocument to the output file. When the trap-errors
    option is on, failures are logged instead of propagated."""
    if self.options.trap_errors:
        try:
            self.tarsqidoc.print_all(self.output)
        except Exception:
            # Narrowed from a bare except, which would also have swallowed
            # KeyboardInterrupt and SystemExit.
            logger.error("Writing output failed")
    else:
        self.tarsqidoc.print_all(self.output)
def createTLinksFromSLinks(self):
    """Calls lookForStlinks for a given Slink object."""
    logger.debug("Number of SLINKs in file: " + str(len(self.slinks)))
    for slinkTag in self.slinks:
        try:
            slink = Slink(self.xmldoc, self.doctree, slinkTag)
            slink.match_rules(self.rules)
        except Exception:
            # Narrowed from a bare except so system-exiting exceptions
            # still propagate.
            logger.error("Error processing SLINK")
def check_trailing_cruft(ignore_paths=None, exit=True):
    """
    Recursively finds all files relative to CWD and checks
    them for trailing whitespace and newlines

    :param ignore_paths: list of paths to ignore during checks
    :param exit: when True, exit the process with status 1 on any finding
    :return:
    """
    # Default was a mutable [] shared across calls; use None sentinel.
    if ignore_paths is None:
        ignore_paths = []
    filenames = []
    pruned_filenames = []
    found_error = False
    valid_extensions = ['py', 'yml', 'rb']
    for root, dirs, files in os.walk('.'):
        # gets ./subdirectory/filename
        filenames.extend([os.path.join(root, name) for name in files
                          if name.split(os.extsep)[-1] in valid_extensions])
        # gets ./filename
        # NOTE(review): this iterates directory names, not files; preserved
        # as-is but looks suspicious -- confirm intent.
        filenames.extend([os.path.join(root, name) for name in dirs
                          if name.split(os.extsep)[-1] in valid_extensions])
    # only work on files not in our ignore paths
    for f in filenames:
        f_parts = f.split(os.sep)
        try:
            if f_parts[1] in ignore_paths:
                continue
        except IndexError:
            continue
        # don't add directories
        if os.path.isfile(f):
            pruned_filenames.append(f)
    for filename in pruned_filenames:
        # don't process blank files
        if os.path.getsize(filename) < 1:
            continue
        # Context manager closes the handle; the old bare open() leaked it.
        with open(filename, 'r') as fh:
            data = fh.readlines()
        newline = trailing_newline(data)
        whitespace = trailing_whitespace(data)
        if newline:
            error = '{}Trailing newline found at the end of {}{}\n'
            logger.error(error.format(colorama.Fore.RED, filename,
                                      colorama.Fore.RESET))
            found_error = True
        if whitespace:
            error = '{}Trailing whitespace found in {} on lines: {}{}\n'
            lines = ', '.join(str(x) for x in whitespace)
            logger.error(error.format(colorama.Fore.RED, filename, lines,
                                      colorama.Fore.RESET))
            found_error = True
    if exit and found_error:
        sys.exit(1)
def createTLinksFromSLinks(self):
    """Calls lookForStlinks for a given Slink object."""
    logger.debug("Number of SLINKs in file: "+str(len(self.slinks)))
    for slinkTag in self.slinks:
        try:
            slink = Slink(self.xmldoc, self.doctree, slinkTag)
            slink.match_rules(self.rules)
        except Exception:
            # Narrowed from a bare except so system-exiting exceptions
            # still propagate.
            logger.error("Error processing SLINK")
def apply_component(self, name, wrapper, infile, outfile):
    """Apply a component if the processing parameters determine that the
    component needs to be applied. This method passes the content tag and
    the xml_document to the wrapper of the component and asks the wrapper
    to process the document fragments.

    Component-level errors are trapped here if trap_errors is True.

    Arguments:
       name - string, the name of the component
       wrapper - instance of a subclass of ComponentWrapper
       infile - string
       outfile - string

    Return value: None"""
    # NOTES
    # - Components still write results to file, which is not conform to
    #   the specs. But writing files to disk is but a minor part of
    #   processing time so for now we'll leave it here and let all
    #   components assume that there is an input file to work with.
    # - Having said that, it is not quite true that the wrappers use the
    #   input file. The wrappers use the xml document and the content tag
    #   and then (i) create fragments from the xml doc, (ii) process the
    #   fragments, (iii) reinsert the fragments in the xml doc, and (iv)
    #   write the xml doc to a file. But the file created is not opened by
    #   the next wrapper.
    # - Errors are now trapped here instead of in the component since we
    #   do not tell the component what the output file is.
    def call_wrapper(wrapper, content_tag, xmldoc, trap_errors, outfile):
        # Run the component, then persist the updated document.
        wrapper(content_tag, xmldoc, self).process()
        self.xml_document.save_to_file(outfile)
    logger.info("RUNNING " + name + " on: " + infile)
    trap_errors = self.getopt_trap_errors()
    if trap_errors:
        try:
            call_wrapper(wrapper, self.content_tag, self.xml_document,
                         trap_errors, outfile)
        except Exception:
            # sys.exc_type/sys.exc_value are deprecated Python 2 globals
            # (removed in Python 3); sys.exc_info() is the portable form.
            exc_type, exc_value = sys.exc_info()[:2]
            logger.error(name + " error on " + infile + "\n\t" +
                         str(exc_type) + "\n\t" + str(exc_value) + "\n")
            # Pass the input through unchanged so the pipeline continues.
            shutil.copy(infile, outfile)
    else:
        call_wrapper(wrapper, self.content_tag, self.xml_document,
                     trap_errors, outfile)
def get_event_attribute(self, attr, optional=False):
    """Return the value of attribute 'attr' from self.dict. When absent:
    log an error unless the attribute is optional, then fall back to a
    default when one exists (currently only polarity defaults, to POS)."""
    value = self.dict.get(attr)
    if value is not None:
        return value
    if not optional:
        logger.error("No %s attribute for current event" % attr)
    return 'POS' if attr == LIBRARY.timeml.POL else None
def _run_gutime_on_string(input_string):
    """Run the GUTIME Perl script on a string and return its stdout as a
    string; any stderr output is logged as an error."""
    pipe = subprocess.PIPE
    # close_fds cannot be combined with redirected std handles on Windows
    use_close_fds = sys.platform != 'win32'
    process = subprocess.Popen(["perl", "TimeTag.pl"],
                               stdin=pipe, stdout=pipe, stderr=pipe,
                               close_fds=use_close_fds)
    (result, error) = process.communicate(input_string)
    if error:
        logger.error(error)
    return result
def _initialize_nodes(self):
    """Set self.nodes from either the VerbChunk daughters (when self.node
    is set, so the chunk can be handed to GramVChunk following GramChunk
    behaviour) or from the given Token list (when self.tokens is set).
    Logs an error when neither source is available."""
    if self.node:
        self.nodes = self.node.dtrs
        return
    if self.tokens:
        self.nodes = self.tokens
        return
    logger.error("Incorrect initialization of GramVChunkList")
def _initialize_nodes(self):
    """Set self.nodes from either the VerbChunk daughters (when self.node
    is set, so the chunk can be handed to a VChunkFeatures instance
    following ChunkFeatures behaviour) or from the given Token list (when
    self.tokens is set). Logs an error when neither source is available."""
    if self.node:
        self.nodes = self.node.dtrs
        return
    if self.tokens:
        self.nodes = self.tokens
        return
    logger.error("Incorrect initialization of VChunkFeaturesList")
def createTLinksFromALinks(self):
    """Calls alink.lookForAtlinks to add Tlinks from Alinks. This is rather
    moronic unfortunately because it will never do anything because at the
    time of application there are no tlinks in the document. Needs to be
    separated out and apply at a later processing stage, after all other
    tlinking."""
    logger.debug("Number of ALINKs in file: "+str(len(self.alinks)))
    for alinkTag in self.alinks:
        try:
            alink = Alink(self.xmldoc, self.doctree, alinkTag)
            alink.lookForAtlinks()
        except Exception:
            # Narrowed from a bare except so system-exiting exceptions
            # still propagate.
            logger.error("Error processing ALINK")
def _get_executable(self):
    """Return the path to the TreeTagger executable for the current
    platform, or None when the platform is unsupported (an error is logged
    in that case, and also when the binary path does not exist)."""
    if sys.platform == "win32":
        executable = os.path.join(self.bindir, WINDOWS_EXECUTABLE)
    elif sys.platform.startswith("linux"):
        # sys.platform is "linux2" on Python 2 and "linux" on Python 3;
        # the old exact match missed Python 3.
        executable = os.path.join(self.bindir, LINUX_EXECUTABLE)
    elif sys.platform == "darwin":
        executable = os.path.join(self.bindir, MAC_EXECUTABLE)
    else:
        # Previously fell through with 'executable' unbound, raising an
        # UnboundLocalError below; fail cleanly instead.
        logger.error("No binary for platform %s" % sys.platform)
        return None
    if not os.path.isfile(executable):
        logger.error("TreeTagger binary invalid: %s" % executable)
    return executable
def process_doctree(self, doctree):
    """Apply all S2T rules to doctree."""
    self.doctree = doctree
    self.docelement = self.doctree.docelement
    events = self.doctree.tarsqidoc.tags.find_tags('EVENT')
    # index events by their instance id for rule matching
    eventsIdx = dict([(e.attrs['eiid'], e) for e in events])
    for slinktag in self.doctree.slinks:
        slink = Slink(self.doctree, eventsIdx, slinktag)
        try:
            slink.match_rules(self.rules)
        except Exception:
            # Narrowed from a bare except.
            logger.error("S2T Error when processing Slink instance")
    self._add_links_to_docelement()
def createTLinksFromALinks(self):
    """Calls alink.lookForAtlinks to add Tlinks from Alinks. This is rather
    moronic unfortunately because it will never do anything because at the
    time of application there are no tlinks in the document. Needs to be
    separated out and apply at a later processing stage, after all other
    tlinking."""
    logger.debug("Number of ALINKs in file: " + str(len(self.alinks)))
    for alinkTag in self.alinks:
        try:
            alink = Alink(self.xmldoc, self.doctree, alinkTag)
            alink.lookForAtlinks()
        except Exception:
            # Narrowed from a bare except so system-exiting exceptions
            # still propagate.
            logger.error("Error processing ALINK")
def __getitem__(self, idx):
    """Return the idx-th sample as a (bsp_data, label) pair, loaded from
    the numpy archive backing this dataset. Failures are logged and the
    method implicitly returns None (callers must handle that)."""
    try:
        focus_file = np.load(self.data_path + self.index_to_file[idx])
        transform_no = self.index_to_transform[idx]
        bsp_data = focus_file[transform_no]
        label = focus_file[self.label_name]
        return bsp_data, label
    except Exception:
        # Narrowed from a bare except. Also use %s placeholders: the old
        # call passed extra positional args with no format slots, which
        # the logging module cannot render.
        error('Error loading file! %s %s %s',
              self.data_path, idx, self.index_to_file[idx])
def __iter__(self):
    """Yield a TaggedDocument for every file under self.source, skipping
    (and logging) files that fail to process."""
    for root, subdirs, files in os.walk(self.source):
        for fname in files:
            full_path = os.path.join(root, fname)
            with open(full_path, encoding='utf8') as file:
                try:
                    text = file.read()
                    text = text.replace('\n', '')
                    words = self.lexicon.review_to_wordlist(
                        utils.to_unicode(text))
                    tag = Constants.extract_tag(full_path)
                    yield TaggedDocument(words=words, tags=[tag])
                except Exception:
                    # Narrowed from a bare except: a bare except around a
                    # yield also swallows GeneratorExit, which breaks
                    # generator cleanup.
                    logger.error("failed processing file: %s", fname)
def _find_alink(self, sentence, event_expr):
    """Try to find an alink with event_expr as the trigger; alinks are
    created as a side effect by the chunk node. Backward patterns are only
    tried when forward patterns existed but did not create an alink."""
    evNode = sentence[event_expr.locInSent]
    if evNode is None:
        logger.error("No node found at locInSent=%s" % event_expr.locInSent)
        return
    forward_contexts = event_expr.alinkingContexts('forward')
    if not forward_contexts:
        return
    if evNode.find_forward_alink(forward_contexts):
        return
    backward_contexts = event_expr.alinkingContexts('backwards')
    if backward_contexts:
        evNode.find_backward_alink(backward_contexts)
def get_event_attribute(self, attr, optional=False):
    """Return the value of an attribute from self.dict. If the attribute
    is not in the dictionary, then (i) return a default value if there is
    one, and (ii) write an error if the attribute is not optional.

    Arguments:
       attr - a string
       optional - a boolean"""
    try:
        return self.dict[attr]
    except KeyError:
        if not optional:
            logger.error("No %s attribute for current event" % attr)
        # polarity defaults to positive
        if attr == POL:
            return 'POS'
        return None
def measure_performance_auc(test_y, result_y, result_y_prob):
    """Compute accuracy and, for binary problems, ROC AUC of predictions.

    Arguments:
       test_y - true labels
       result_y - predicted labels
       result_y_prob - predicted probabilities for the positive class

    Returns (accuracy, auc), where auc is None for non-binary problems;
    returns None (implicitly) when metric computation fails."""
    try:
        vacc = metrics.accuracy_score(test_y, result_y)
        # find validation AUC; only defined for binary classification
        if len(np.unique(test_y)) == 2:
            vauc = roc_auc_score(test_y, result_y_prob)
            logger.info('Accuracy: {0:.3f} and AUC {1:.3f}'.format(
                vacc, vauc))
        else:
            vauc = None
            logger.info('Accuracy: {0:.3f}'.format(vacc))
        return vacc, vauc
    except Exception:
        # Narrowed from a bare except; also fixed the "Accurary" typo in
        # the log messages above.
        logger.error("Error calculating metrics")
def execute(self):
    """
    Executes ansible-galaxy install, baking the command first if needed.

    :return: sh.stdout on success; on failure logs the error and exits
             the process with the command's exit code
    """
    if self.galaxy is None:
        self.bake()
    try:
        result = self.galaxy()
    except sh.ErrorReturnCode as exc:
        logger.error('ERROR: {}'.format(exc))
        sys.exit(exc.exit_code)
    return result.stdout
def process_doctree(self, doctree):
    """Apply all S2T rules to doctree."""
    self.doctree = doctree
    # For sanity we clean out the tlinks since we are adding new tlinks to
    # the document, if we don't do this we might add some links twice.
    self.doctree.tlinks = []
    self.docelement = self.doctree.docelement
    events = self.doctree.tarsqidoc.tags.find_tags(LIBRARY.timeml.EVENT)
    # index events by their instance id for rule matching
    eventsIdx = dict([(e.attrs['eiid'], e) for e in events])
    for slinktag in self.doctree.slinks:
        slink = Slink(self.doctree, eventsIdx, slinktag)
        try:
            slink.match_rules(self.rules)
        except Exception:
            # Narrowed from a bare except.
            logger.error("S2T Error when processing Slink instance")
    self._add_links_to_tarsqidoc()
def execute(self, hide_errors=False):
    """
    Executes ansible-playbook, baking the command first if needed.

    :param hide_errors: when True, suppress error logging on failure
    :returns: (None, stdout) on success, (exit_code, None) on failure
    """
    if self.ansible is None:
        self.bake()
    try:
        output = self.ansible().stdout
    except (sh.ErrorReturnCode, sh.ErrorReturnCode_2) as exc:
        if not hide_errors:
            logger.error('ERROR: {}'.format(exc))
        return exc.exit_code, None
    return None, output
def get_tokens_from_sequence(sequence):
    """Given a sequence of elements, collect all the token leaves and
    return them as a list. Tokens and events are taken as-is; chunks and
    timexes are expanded into their tokens."""
    # TODO: this can probably use get_tokens
    collected = []
    for element in sequence:
        if element.isToken():
            collected.append(element)
        elif element.isChunk():
            collected.extend(get_tokens(element))
        elif element.isEvent():
            collected.append(element)
        elif element.isTimex():
            collected.extend(get_tokens(element))
        else:
            logger.error("unknown item type: %s"
                         % element.__class__.__name__)
    return collected
def _apply_component(self, name, wrapper, tarsqidocument):
    """Apply a component by taking the TarsqDocument, which includes the
    options from the Tarsqi instance, and passing it to the component
    wrapper. Component-level errors are trapped here if --trap-errors is
    True. If errors are trapped, it is still possible that partial results
    were written to the TagRepositories in the TarsqiDocument."""
    logger.info(name + '............')
    t1 = time.time()
    if self.options.trap_errors:
        try:
            wrapper(tarsqidocument).process()
        except Exception:
            # sys.exc_type/sys.exc_value are deprecated Python 2 globals
            # (removed in Python 3); sys.exc_info() is the portable form.
            exc_type, exc_value = sys.exc_info()[:2]
            logger.error("%s error:\n\t%s\n\t%s\n"
                         % (name, exc_type, exc_value))
    else:
        wrapper(tarsqidocument).process()
    logger.info("%s DONE (%.3f seconds)" % (name, time.time() - t1))
def _print_tags(self, fh, tag_group, tags):
    """Write all tags in the group to the open file handle fh as a
    <tag_group> element, one TTK tag per line, sorted for a stable output
    order. Tags that fail to decode are skipped with a logged error."""
    fh.write("<%s>\n" % tag_group)
    for tag in sorted(tags):
        try:
            ttk_tag = tag.as_ttk_tag()
            # This became needed after allowing any text in the value of the
            # form and lemma attribute.
            if isinstance(ttk_tag, str):
                # Python 2: coerce byte strings to unicode, dropping any
                # bytes that do not decode.
                ttk_tag = unicode(ttk_tag, errors='ignore')
            fh.write(" %s\n" % ttk_tag)
        except UnicodeDecodeError:
            # Not sure why this happened, but there were cases where the
            # result of as_ttk_tag() was a byte string with a non-ascii
            # character. The code in the try clause was changed to prevent
            # the error, but leave the except here just in case.
            logger.error("UnicodeDecodeError on printing a tag.")
    fh.write("</%s>\n" % tag_group)
def get_tokens_from_sequence(sequence):
    """Given a sequence of elements, collect all the token leaves and
    return them as a list. Tokens are taken as-is; chunks, events and
    timexes are expanded into their tokens."""
    # TODO: this can probably use get_tokens
    collected = []
    for element in sequence:
        if element.isToken():
            collected.append(element)
        elif element.isChunk():
            collected.extend(get_tokens(element))
        elif element.isEvent():
            collected.extend(get_tokens(element))
        elif element.isTimex():
            collected.extend(get_tokens(element))
        else:
            logger.error("unknown item type: %s"
                         % element.__class__.__name__)
    return collected
def diffbot_article_api(self, query_limit=0):
    """Fetch article content from the Diffbot article API for stored URLs.

    With query_limit <= 0, process every article with status 0; otherwise
    process a random sample of up to query_limit articles with status 0 or
    1. Responses with body text update the row to status 1 with content;
    empty-text responses only refresh the title.

    Returns the list of successful Diffbot response objects.
    Raises RuntimeError when any per-URL request failed."""
    resp = []
    exceptions = []
    if query_limit <= 0:
        self.urls = (Article
                     .select()
                     .where(Article.status == 0))
    else:
        self.urls = (Article
                     .select()
                     .where((Article.status == 0) | (Article.status == 1))
                     .order_by(fn.Random())
                     .limit(query_limit))
    for url in self.urls:
        try:
            print(url.article_url)
            response = self.diffbot.request(
                url.article_url, self.token, 'article')
            title = response['objects'][0]['title']
            if response['objects'][0]['text'] == "":
                # no body text: record the title only, leave status at 0
                u = Article.update(
                    title=title,
                    modified_utc=datetime.utcnow(),
                ).where(Article.id == url.id)
            else:
                resp.append(response['objects'][0])
                u = Article.update(
                    status=1,
                    title=title,
                    content=response['objects'][0],
                    modified_utc=datetime.utcnow(),
                ).where(Article.id == url.id)
            u.execute()
        except Exception as e:
            logger.error('%s happened when handling %s', str(e), url)
            exceptions.append(e)
            continue
    # The error check used to sit AFTER an unconditional 'return resp' and
    # was unreachable; raise first, then return.
    if exceptions:
        raise RuntimeError('Error received from Diffbot')
    return resp
def run_timex_linking(self):
    """Apply the rules that govern relations between TIMEX3 tags. Only
    applies to TIMEX3 tags with type=DATE."""
    # TODO: add a DCT TIMEX tag if it is not in the tags dictionary, but
    # maybe check first whether it is in the dictionary in case we care
    # about duplications (see https://github.com/tarsqi/ttk/issues/10 and
    # https://github.com/tarsqi/ttk/issues/13)
    all_timexes = self.tarsqidoc.tags.find_tags(TIMEX)
    dates = [t for t in all_timexes if t.attrs[TYPE] == 'DATE']
    for t1, t2 in _timex_pairs(dates):
        if self.tarsqidoc.options.trap_errors:
            try:
                self._create_timex_link(t1, t2)
            except Exception:
                logger.error("Error linking:\n%s\n%s" % (t1, t2))
        else:
            self._create_timex_link(t1, t2)
def _run_timex_linking(self):
    """Apply the rules that govern relations between TIMEX3 tags. Only
    applies to TIMEX3 tags with a TYPE attribute equal to DATE (the old
    docstring said VAL, but the code filters on TYPE)."""
    timexes = [timex for timex in self.xmldoc.get_tags(TIMEX)
               if timex.attrs['TYPE'] == 'DATE']
    for t in timexes:
        if t.attrs.get('VAL', None) is None:
            logger.warn("Missing VAL: %s" % t.get_content())
    # Visit each unordered pair exactly once; the old full i x j loops
    # with an 'i < j' test wasted half the iterations.
    for i in range(len(timexes)):
        for j in range(i + 1, len(timexes)):
            try:
                self._create_timex_link(timexes[i], timexes[j])
            except Exception:
                logger.error("Error in Timex Linking:\n%s\n%s" %
                             (timexes[i].get_content(),
                              timexes[j].get_content()))
def _find_lexically_based_slinks(self, event_expr):
    """Try to find lexically based Slinks using forward, backward and
    reporting FSA patterns. No return value; if a Slink is found, it will
    be created by the chunk that embeds the Slink triggering event.

    Arguments:
       event_expr - an EventExpression"""
    evNode = self.currSent[event_expr.locInSent]
    if evNode is None:
        logger.error("No event node found at locInSent")
        # Bail out: the FSA searches below would otherwise crash with an
        # AttributeError on None.
        return
    forwardFSAs = event_expr.slinkingContexts('forward')
    if forwardFSAs:
        evNode.find_forward_slink(forwardFSAs)
        if evNode.createdLexicalSlink:
            evNode.createdLexicalSlink = 0
            return
    backwardFSAs = event_expr.slinkingContexts('backwards')
    if backwardFSAs:
        logger.debug("PROCESS for BACKWARD slinks")
        evNode.find_backward_slink(backwardFSAs)
        if evNode.createdLexicalSlink:
            evNode.createdLexicalSlink = 0
            return
    reportingFSAs = event_expr.slinkingContexts('reporting')
    if reportingFSAs:
        logger.debug("PROCESS for REPORTING slinks")
        evNode.find_reporting_slink(reportingFSAs)
        if evNode.createdLexicalSlink:
            evNode.createdLexicalSlink = 0
def setup_docmodel(self, tarsqi_instance):
    """Initialize the document_model and processing_parameters instance
    variables of a TarsqiControl instance, using its data source identifier
    and processing options.

    Arguments:
       tarsqi_instance - a TarsqiControl instance

    No return value."""
    tarsqi_instance.processing_parameters = ProcessingParameters(tarsqi_instance)
    data_source_identifier = tarsqi_instance.data_source_identifier
    constructor = self.dsi_to_docmodelconstructor.get(data_source_identifier, None)
    try:
        # An unknown identifier leaves constructor as None, and calling
        # None raises the TypeError handled below. NOTE(review): a
        # TypeError raised *inside* a valid constructor is caught here too
        # and misreported as an unknown identifier -- confirm intent.
        constructor(tarsqi_instance)
    except TypeError, e:
        # log error and use simple-xml as a default
        logger.error("Unknown data source identifier, using simple-xml")
        tarsqi_instance.data_source_identifier = 'simple-xml'
        data_source_identifier = tarsqi_instance.data_source_identifier
        self._setup_docmodel_simple_xml(tarsqi_instance)
def process(self, infile, outfile):
    """Ask the component to process a file fragment. This is the method
    that is called from the component wrappers and it should be overwritten
    on all subclasses. An error is written to the log file if this method
    is ever executed."""
    # Abstract-method stand-in: subclasses must override.
    logger.error("TarsqiComponent.process() not overridden")