def cut(self, sentence, punctuation=True):
    '''Tokenize *sentence*, yielding terms one at a time (generator).

    Walks a cursor over the sentence and dispatches on character class:
    Chinese characters go through the MMSEG-style matching/ambiguity
    pipeline, punctuation is yielded as-is (only when *punctuation* is
    True), and anything else is consumed as a non-Chinese word.

    sentence    -- the text to segment.
    punctuation -- when True, punctuation characters are emitted as
                   single-character tokens; when False they are handled
                   by the non-Chinese-word branch instead.
    '''
    sentence_length = len(sentence)
    cursor = 0
    while cursor < sentence_length:
        if self.is_chinese_char(sentence[cursor]):
            DEBUG('begin pos 中文')
            # Matching Algorithm: collect candidate word chunks starting
            # at the current cursor position.
            chunks = self.__get_chunks(sentence, cursor)
            if len(chunks) > 0:
                # Ambiguity Resolution Rules pick the best chunk; empty
                # strings are filtered out before yielding.
                words, length = self.__ambiguity_resolution(chunks)
                for term in list(filter(None, words)):
                    yield term
                # Set the next begin pos: advance by the total length of
                # the words just consumed.
                cursor += length
            else:
                # No chunk matched: emit the single Chinese char itself.
                yield sentence[cursor]
                cursor += 1
        elif self.V.is_punctuation(sentence[cursor]) and punctuation:
            DEBUG("begin pos 字符")
            yield sentence[cursor]
            cursor += 1
        else:
            DEBUG('begin pos 非中文单词(英文单词, etc.)')
            # Non-Chinese word (e.g. ASCII); helper returns the word and
            # the cursor position just past it and any trailing separators.
            word, cursor = self.__match_none_chinese_words(sentence, cursor)
            yield word
            DEBUG("cursor %s" % cursor)
            DEBUG("char %s" % sentence[cursor - 1])
def __match_none_chinese_words(sentence, begin_pos):
    '''Extract a non-Chinese (ASCII) word starting at or after begin_pos.

    Skips leading separators (anything that is neither ASCII nor Chinese,
    i.e. punctuation and whitespace in either script), consumes the run of
    ASCII characters, then skips trailing separators.

    Returns (word, cursor) where *word* is the extracted slice and
    *cursor* is the index of the next token to process.
    '''
    DEBUG('__match_none_chinese_words %s' % sentence[begin_pos])
    length = len(sentence)

    def _skip_separators(i):
        # Advance past characters that belong to neither script.
        while i < length and not (Tokenizer.is_ascii_char(sentence[i])
                                  or Tokenizer.is_chinese_char(sentence[i])):
            i += 1
        return i

    pos = _skip_separators(begin_pos)       # skip pre-word separators
    word_start = pos
    while pos < length and Tokenizer.is_ascii_char(sentence[pos]):
        pos += 1                            # consume the ASCII word
    word_end = pos
    pos = _skip_separators(pos)             # skip post-word separators

    return sentence[word_start:word_end], pos
def __ambiguity_resolution(self, chunks):
    '''Select the best chunk using the four MMSEG disambiguation rules.

    Each rule keeps only the chunks with the maximal score for its
    metric and is applied only while more than one candidate remains:
    1. total word length, 2. average word length,
    3. standard deviation of word lengths, 4. word frequency.

    Returns (word_texts, consumed_length) on success, or ('', 1) when
    the rules fail to isolate a single chunk.
    '''
    DEBUG('__ambiguity_resolution 开始消岐')
    for chunk in chunks:
        [DEBUG(word.text) for word in chunk.words]
        DEBUG('-' * 20)

    if len(chunks) > 1:
        # Rule 1: maximal total word length.
        DEBUG("# Rule 1: 根据 total_word_length 进行消岐")
        best = max(c.total_word_length for c in chunks)
        chunks = [c for c in chunks if c.total_word_length == best]

    if len(chunks) > 1:
        # Rule 2: maximal average word length.
        DEBUG("# Rule 2: 根据 average_word_length 进行消岐")
        best = max(c.average_word_length for c in chunks)
        chunks = [c for c in chunks if c.average_word_length == best]

    if len(chunks) > 1:
        # Rule 3: maximal standard deviation of word lengths.
        best = max(c.standard_deviation for c in chunks)
        chunks = [c for c in chunks if c.standard_deviation == best]

    if len(chunks) > 1:
        # Rule 4: maximal word frequency.
        best = max(c.word_frequency for c in chunks)
        chunks = [c for c in chunks if c.word_frequency == best]

    if len(chunks) != 1:
        # Segmentation failed: more (or fewer) than one candidate left.
        DEBUG("分词失败")
        Tokenizer.__print_chunks(chunks)
        return '', 1

    words = chunks[0].words
    # Total consumed length is the sum of the winning words' lengths.
    return [w.text for w in words], sum(w.length for w in words)
smpv.setDebug(debug)
if smpv.connected is not True:
    print('An issue with connection to the SolarmanPV API')
    sys.exit(1)

# Fetch today's power readings (most recent value only).
power_details = smpv.getPower(datetime.date.today().strftime("%Y-%m-%d"), True)
if power_details is not None:
    power = power_details['power']
    # Convert the API's UTC timestamp to local time before uploading to
    # pvoutput.org, which expects local date/time.
    power_datetime_local = utc_to_local(
        datetime.datetime.strptime(power_details['time'], "%Y-%m-%dT%H:%M:%SZ"))
    power_date = power_datetime_local.strftime("%Y%m%d")
    power_time = power_datetime_local.strftime("%H:%M")
    DEBUG('power == ' + str(power) + ' and power_date == ' + power_date)
    try:
        # Create connection to pvoutput.org
        pvout = PVoutput_Connection(pvo_key, pvo_system_id)
        # Update pvoutput - but only if there is a valid power value (i.e. > 0).
        # add_output() is for end-of-day output (peak power, peak time, etc.):
        # pvout.add_output(power_date, generated=power)
        # so add_status() is used for the live reading instead.
        if power > 0:
            pvout.add_status(power_date, power_time, power_exp=power)
        else:
            DEBUG('no need to update - power %dW' % power)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; original diagnostic output preserved.
        print('An error with PVoutput ', sys.exc_info()[0])
def getPower(self, date_to_retrieve=None, most_recent_value=None): if date_to_retrieve is None: date_to_retrieve = datetime.date.today().strftime('%Y-%m-%d') if self.debug: DEBUG('today == ' + date_to_retrieve) DEBUG('Getting plant power (for a day)') # Get the power data for a specified date or today url = solarman_pv_api_base + '/plant/power' params = { 'plant_id': self.__plant_id, 'date': date_to_retrieve, 'timezone_id': 'Europe/Amsterdam' } try: response = requests.get(url, verify=self.__requests_verify, headers=self.__auth_headers, params=params) except requests.exceptions.ConnectionError as e: print '%s: connection failed - %s' % (self.__class__.__name__, e) return None response.encoding = 'utf-8' if self.debug: print response print response.url print response.encoding print response.text #print response.json() # validate response if 'data' not in response.json() and 'powers' not in response.json(): # should return None, maybe an exception print '%s: data or powers not in response: %s' % ( self.__class__.__name__, response.text) return None if most_recent_value is True: # Sort the json() (just to be sure), take the last value power_data = response.json()['data']['powers'] most_recent_power_data = None if isinstance(power_data, list): power_data.sort(key=self.__extractTime, reverse=True) # temporary exception handler to nut out but first thing in the morning try: most_recent_power_data = power_data[0] except IndexError: # Some error in response from the API, i.e. an empty list most_recent_power_data = None except: # Effectively an unhandled error - retaining this debug whilst in beta testing mode print '%s: Exception: getPower(): An error with power_data %s' % ( self.__class__.__name__, sys.exc_info()[0]) print str(power_data) print power_data else: # temporary debug - whilst in beta testing mode print str(power_data) return most_recent_power_data else: return response.json()
def __print_words(words, tag='__print_words'):
    '''Log each word's text and length, prefixed with *tag* (debug helper).'''
    for word in words:
        DEBUG('%s: %s length: %d' % (tag, word.text, word.length))
def __print_chunks(chunks, tag='__print_chunks'):
    '''Log each chunk as its words' texts joined by spaces (debug helper).

    Fix: the original nested loop (`for y in x.words:` then
    `for w in y`) iterated over a single word object as if it were a
    sequence of words; word objects expose `.text` directly, as the
    sibling __print_words and __ambiguity_resolution show. Now one line
    is logged per chunk.
    '''
    for chunk in chunks:
        DEBUG('%s: %s' % (tag, ' '.join([w.text for w in chunk.words])))
def __init__(self, dict_path=os.path.join(curdir, 'dict.txt')):
    '''Build the tokenizer's vocabulary.

    dict_path -- path to the dictionary file; defaults to dict.txt next
                 to this module.
    '''
    self.V = Vocabulary(dict_path=dict_path)
    # Fix: log success only AFTER the vocabulary is actually built; the
    # original logged 'Vocabulary loaded.' before loading anything.
    DEBUG('Vocabulary loaded.')