Пример #1
0
 def testUnknownColumn(self):
     # Building a specification that contains an _UNKNOWN column is expected
     # to emit an INFO record with the message template asserted below.
     # NOTE(review): 'otplc.colspec' is presumably the logger name being
     # captured, and args=(1,) presumably the reported column number --
     # confirm against interceptLogs() and from_integers().
     self.interceptLogs('otplc.colspec')
     C.from_integers([C._UNKNOWN, C.TOKEN])
     expected_message = u'ignoring _UNKNOWN column %s'
     self.test_log.assertMatches(
         expected_message, args=(1,), levelname='INFO'
     )
Пример #2
0
def guess_colspec(otpl_reader):
    """
    Determine the column specification for the data behind a reader.

    Note that for guessing to work, the optionally present global enumeration
    column must be placed *before* the (also optional) local enumeration
    column.

    If the input file has a colspec header, that header is used instead of any
    guessing.

    An ``IOError``, ``UnicodeDecodeError`` or ``DataFormatError`` raised while
    making the guess is logged as a warning and treated as a failed guess.

    :param otpl_reader: a reader instance
    :type otpl_reader: OtplReader
    :raises AttributeError: if the reader has an undefined separator property
    :returns: a :class:`ColumnSpecification` or ``None`` if the guessing fails
    """
    try:
        candidate = _make_guess(otpl_reader)
    except (IOError, UnicodeDecodeError, DataFormatError) as error:
        L.warning(str(error))
        # An empty candidate falls through to the "failed" path below.
        candidate = []

    # A ready-made specification means it was parsed from a header.
    if isinstance(candidate, Spec):
        L.info(u'from header: %s', str(candidate))
        return candidate

    # Two or more guessed columns are accepted and converted.
    if not len(candidate) < 2:
        L.debug(u'as: %s', Spec.to_string(candidate))
        return Spec.from_integers(candidate)

    # Anything shorter is discarded as a failed guess.
    L.warning(u'failed for "%s"', otpl_reader.path)
    L.debug(u'discarded guess was: %s', Spec.to_string(candidate))
    return None
Пример #3
0
 def testParseColspec(self):
     # Round trip: joining every name in C.NAMES with spaces and parsing the
     # result must give back the corresponding values, in the same order.
     self.interceptLogs('otplc.colspec')
     name_value_pairs = C.NAMES.items()
     # noinspection PyUnresolvedReferences
     names, values = zip(*name_value_pairs)
     colspec_string = ' '.join(names)
     parsed = C.parse_colspec(colspec_string)
     self.assertSequenceEqual(values, parsed)
     # Exactly two WARNING records with this message are expected.
     expected_warning = u'using an internal colspec type; probably a Bad Idea'
     self.test_log.assertMatches(
         expected_warning, levelname='WARNING', count=2
     )
Пример #4
0
 def testInitialization(self):
     # The trailing comments give each column's zero-based position, which
     # is what the index-based assertions below refer to.
     columns = [
         C.SEGMENT_ID,     # 0
         C.GLOBAL_ENUM,    # 1
         C.LOCAL_ENUM,     # 2
         C.TOKEN,          # 3
         C.POS_TAG,        # 4
         C.LOCAL_REF,      # 5
         C.RELATION,       # 6
         C.ENTITY,         # 7
         C.GLOBAL_REF,     # 8
         C.GLOBAL_REF,     # 9
         C.EVENT,          # 10
         C.ATTRIBUTE,      # 11
         C.NORMALIZATION,  # 12
         C.LOCAL_REF,      # 13
         C.LOCAL_REF,      # 14
         C.EVENT,          # 15
     ]
     spec = C.from_integers(columns)

     # Single-valued column positions.
     self.assertEqual(1, spec._global_enum)
     self.assertEqual(2, spec._local_enum)
     self.assertEqual(3, spec._token)
     self.assertEqual(4, spec._pos_tag)

     # Collections and mappings keyed by column position.
     self.assertEqual({7}, spec._entities)
     self.assertEqual({8: 7, 9: 7}, spec._global_refs)
     self.assertEqual({6: 4}, spec._relations)
     self.assertEqual({10: (8, (9,)), 15: (13, (14,))}, spec._events)
     self.assertEqual({12: 10}, spec._normalizations)  # important: norm of event!
     self.assertEqual({11: 10}, spec._attributes)
Пример #5
0
def _make_guess(segments):
    guess = None
    last_round = False

    for idx, segment in enumerate(segments):
        if not guess:
            # noinspection PyUnresolvedReferences
            if len(segment) == 1 and all(
                    n.split(u':')[0] in Spec.NAMES for n in segment[0]
            ):
                return Spec.from_string(' '.join(segment[0]))

            guess = Guess(segment)
        else:
            guess.update(segment)

        if idx > 4 or last_round:
            break
        elif guess.complete():
            last_round = True

    return guess.guess