Пример #1
0
    def cut(self, sentence, punctuation=True):
        '''
        Lazily segment ``sentence`` and yield its tokens in order.

        A position holding a Chinese character is segmented through
        ``__get_chunks`` followed by ``__ambiguity_resolution``; when no
        chunk is produced the single character is yielded on its own.
        Every other position is handed to ``__match_none_chinese_words``.

        :param sentence: indexable text to segment.
        :param punctuation: when true, a character accepted by
            ``self.V.is_punctuation`` is yielded as a token of its own;
            otherwise it goes through ``__match_none_chinese_words`` like
            any other non-Chinese character.
        :return: generator yielding non-empty tokens.
        '''
        sentence_length = len(sentence)
        cursor = 0

        while cursor < sentence_length:
            if self.is_chinese_char(sentence[cursor]):
                DEBUG('begin pos 中文')
                # Matching algorithm: collect candidate chunks at cursor.
                chunks = self.__get_chunks(sentence, cursor)
                if len(chunks) > 0:
                    # Ambiguity resolution rules pick one chunk.
                    words, length = self.__ambiguity_resolution(chunks)
                    for term in words:
                        # Empty terms carry no information; skip them.
                        if term:
                            yield term
                    # Set the next begin pos
                    cursor += length
                else:
                    # A single Chinese character with no candidate chunk.
                    yield sentence[cursor]
                    cursor += 1
            elif self.V.is_punctuation(sentence[cursor]) and punctuation:
                DEBUG("begin pos 字符")
                yield sentence[cursor]
                cursor += 1
            else:
                DEBUG('begin pos 非中文单词(英文单词, etc.)')
                word, cursor = self.__match_none_chinese_words(
                    sentence, cursor)
                # The matcher returns '' when it only skipped separators
                # (e.g. punctuation while punctuation=False); do not emit
                # an empty token, matching the Chinese branch above.
                if word:
                    yield word

            DEBUG("cursor %s" % cursor)
            DEBUG("char %s" % sentence[cursor - 1])
Пример #2
0
    def __match_none_chinese_words(sentence, begin_pos):
        '''
        Cut one non-Chinese word out of ``sentence`` starting at ``begin_pos``.

        Scans in three phases: skip characters that are neither
        ``Tokenizer.is_ascii_char`` nor ``Tokenizer.is_chinese_char``,
        consume the run of ``is_ascii_char`` characters (the word), then
        skip again up to the next ascii or Chinese character.

        :param sentence: indexable text being segmented.
        :param begin_pos: index at which scanning starts.
        :return: ``(word, next_pos)``; ``word`` may be empty when no ascii
            run follows the skipped characters.
        '''
        DEBUG('__match_none_chinese_words %s' % sentence[begin_pos])
        total = len(sentence)

        def starts_token(ch):
            # True for characters that end a run of skipped separators.
            return Tokenizer.is_ascii_char(ch) or Tokenizer.is_chinese_char(ch)

        # Phase 1: skip leading separators to find where the word starts.
        start = begin_pos
        while start < total and not starts_token(sentence[start]):
            start += 1

        # Phase 2: the word is the following run of ascii characters.
        end = start
        while end < total and Tokenizer.is_ascii_char(sentence[end]):
            end += 1

        # Phase 3: skip trailing separators so the caller resumes on a
        # character that begins the next token.
        next_pos = end
        while next_pos < total and not starts_token(sentence[next_pos]):
            next_pos += 1

        # The word and the position to continue from.
        return sentence[start:end], next_pos
Пример #3
0
    def __ambiguity_resolution(self, chunks):
        '''
        Narrow the candidate chunks down to one and return its words.

        Four rules are applied in order, each only while more than one
        candidate remains.  A rule keeps the chunks whose attribute equals
        the maximum of that attribute among the remaining candidates:
        ``total_word_length``, ``average_word_length``,
        ``standard_deviation``, ``word_frequency``.

        :param chunks: candidate chunks; each exposes ``words`` (items
            with ``text`` and ``length``) and the four attributes above.
        :return: ``(texts, length)`` where ``texts`` is the list of word
            texts of the winning chunk and ``length`` the sum of the
            words' ``length``.  When the rules do not leave exactly one
            chunk, ``('', 1)`` is returned.
        '''
        DEBUG('__ambiguity_resolution 开始消岐')
        for chunk in chunks:
            for word in chunk.words:
                DEBUG(word.text)
            DEBUG('-' * 20)

        # (attribute to maximise, DEBUG message or None), in rule order.
        # NOTE(review): classic MMSEG prefers the *smallest* variance for
        # rule 3; confirm that taking the max of ``standard_deviation``
        # is intended given how the chunk class defines it.
        rules = (
            ('total_word_length',
             "# Rule 1: 根据 total_word_length 进行消岐"),
            ('average_word_length',
             "# Rule 2: 根据 average_word_length 进行消岐"),
            ('standard_deviation', None),
            ('word_frequency', None),
        )
        for attribute, message in rules:
            if len(chunks) <= 1:
                # Nothing left to disambiguate; later rules cannot help.
                break
            if message is not None:
                DEBUG(message)
            best = max(getattr(c, attribute) for c in chunks)
            # Falsy chunks are discarded as well, as filter(None, ...) did.
            chunks = [c for c in chunks
                      if getattr(c, attribute) == best and c]

        if len(chunks) != 1:
            # Segmentation failed: report what is left and let the caller
            # advance by one position.
            DEBUG("分词失败")
            Tokenizer.__print_chunks(chunks)
            return '', 1

        words = chunks[0].words
        # sum() replaces reduce(), which is not a builtin on Python 3.
        return [w.text for w in words], sum(w.length for w in words)
Пример #4
0
# Upload the most recent SolarmanPV power reading of today to pvoutput.org.
smpv.setDebug(debug)
if smpv.connected is not True:
    print('An issue with connection to the SolarmanPV API')
    sys.exit(1)

# Second argument True asks getPower() for the most recent reading only.
power_details = smpv.getPower(datetime.date.today().strftime("%Y-%m-%d"), True)
if power_details is not None:
    power = power_details['power']
    # The reading's timestamp is converted from UTC to local time before
    # uploading to pvoutput.org.
    power_datetime_local = utc_to_local(
        datetime.datetime.strptime(power_details['time'],
                                   "%Y-%m-%dT%H:%M:%SZ"))
    power_date = power_datetime_local.strftime("%Y%m%d")
    power_time = power_datetime_local.strftime("%H:%M")

    DEBUG('power == ' + str(power) + ' and power_date == ' + power_date)

    try:
        # Create connection to pvoutput.org
        pvout = PVoutput_Connection(pvo_key, pvo_system_id)

        # add_output() is for end-of-day output (peak power, peak time,
        # etc.), so it is not used here:
        #		pvout.add_output(power_date, generated=power)

        # Update pvoutput only when there is an actual power value (> 0).
        if power > 0:
            pvout.add_status(power_date, power_time, power_exp=power)
        else:
            DEBUG('no need to update - power %dW' % power)
    except Exception:
        # Best-effort upload: report the failure and carry on.  Catching
        # Exception (not a bare except) lets Ctrl-C and sys.exit() through.
        print('An error with PVoutput ', sys.exc_info()[0])
    def getPower(self, date_to_retrieve=None, most_recent_value=None):
        """Fetch the plant's power readings for one day.

        Issues a GET to ``<solarman_pv_api_base>/plant/power`` for
        ``self.__plant_id`` and validates that the decoded JSON carries
        ``['data']['powers']``.

        Args:
            date_to_retrieve: day to query as a ``%Y-%m-%d`` string;
                defaults to today's date.
            most_recent_value: when exactly ``True``, return only the
                reading that ranks highest under ``self.__extractTime``
                instead of the whole response.

        Returns:
            The decoded JSON response, or a single entry of
            ``data.powers`` when ``most_recent_value`` is ``True``.
            ``None`` when the connection fails, when ``data`` / ``powers``
            is missing from the response, or when ``powers`` is empty or
            not a list.
        """
        if date_to_retrieve is None:
            date_to_retrieve = datetime.date.today().strftime('%Y-%m-%d')

        if self.debug:
            DEBUG('today == ' + date_to_retrieve)
            DEBUG('Getting plant power (for a day)')

        # Get the power data for a specified date or today
        url = solarman_pv_api_base + '/plant/power'
        params = {
            'plant_id': self.__plant_id,
            'date': date_to_retrieve,
            # NOTE(review): the timezone is hard-coded; confirm this is
            # right for every plant this client is used with.
            'timezone_id': 'Europe/Amsterdam'
        }
        try:
            response = requests.get(url,
                                    verify=self.__requests_verify,
                                    headers=self.__auth_headers,
                                    params=params)
        except requests.exceptions.ConnectionError as e:
            print('%s: connection failed - %s' % (self.__class__.__name__, e))
            return None

        response.encoding = 'utf-8'

        if self.debug:
            print(response)
            print(response.url)
            print(response.encoding)
            print(response.text)

        # Decode once and reuse; 'powers' lives inside 'data', so both
        # levels have to be checked before indexing into them.
        payload = response.json()
        data = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(data, dict) or 'powers' not in data:
            print('%s: data or powers not in response: %s' % (
                self.__class__.__name__, response.text))
            return None

        if most_recent_value is not True:
            return payload

        power_data = data['powers']
        if not isinstance(power_data, list):
            # temporary debug - whilst in beta testing mode
            print(str(power_data))
            return None
        if not power_data:
            # The API returned an empty list (seen first thing in the
            # morning, before any reading exists).
            return None
        # Equivalent to sorting descending by time and taking element 0.
        return max(power_data, key=self.__extractTime)
Пример #6
0
 def __print_words(words, tag='__print_words'):
     '''
     Emit one DEBUG line per word: the tag, the word's text and its length.

     :param words: iterable of objects exposing ``text`` and ``length``.
     :param tag: prefix placed at the start of every line.
     '''
     template = '%s: %s length: %d'
     for word in words:
         DEBUG(template % (tag, word.text, word.length))
Пример #7
0
 def __print_chunks(chunks, tag='__print_chunks'):
     '''
     Emit one DEBUG line per chunk listing its words' texts, space-separated.

     :param chunks: iterable of chunks, each exposing ``words`` whose
         items have a ``text`` attribute.
     :param tag: prefix placed at the start of every line.
     '''
     for chunk in chunks:
         # Join over the chunk's word list; iterating a single word (as
         # the nested loop used to) fails because words are not iterable.
         DEBUG('%s: %s' % (tag, ' '.join([w.text for w in chunk.words])))
Пример #8
0
 def __init__(self, dict_path=os.path.join(curdir, 'dict.txt')):
     '''
     Create the tokenizer and its vocabulary.

     :param dict_path: path handed to ``Vocabulary``; defaults to
         ``dict.txt`` under ``curdir``.
     '''
     self.V = Vocabulary(dict_path=dict_path)
     # Log only once construction has succeeded, so the message is never
     # printed for a vocabulary that failed to load.
     DEBUG('Vocabulary loaded.')