Пример #1
0
    def __init__(self,
                 np2vec_model_file,
                 binary=False,
                 word_ngrams=False,
                 grouping=False):
        """
        Load the np2vec model for set expansion.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.
            grouping (bool): boolean indicating whether to load the NP grouping
            maps (id2rep, np2id, id2group) from the module directory.
        """
        self.grouping = grouping
        if grouping:
            # Load the NP-grouping maps shipped alongside this module.
            logger.info('loading grouping data')
            self.id2rep = load_json_file(path.join(cur_dir, 'id2rep'))
            self.np2id = load_json_file(path.join(cur_dir, 'np2id'))
            self.id2group = load_json_file(path.join(cur_dir, 'id2group'))
        # Fixed typo in log message: was 'loadind model...'.
        logger.info('loading model...')
        self.np2vec_model = NP2vec.load(np2vec_model_file,
                                        binary=binary,
                                        word_ngrams=word_ngrams)
        # Every vocabulary term ends with a marking character; take it from the
        # first entry (assumes a non-empty vocab — TODO confirm).
        first_term = next(iter(self.np2vec_model.vocab.keys()))
        self.mark_char = first_term[-1]
        # Precompute L2-normalized vectors for fast similarity queries.
        # (Log line moved here so it sits next to the work it describes.)
        logger.info('compute L2 norm')
        self.np2vec_model.init_sims()
        logger.info('done init')
Пример #2
0
    def __init__(
        self,
        np2vec_model_file,
        binary=False,
        word_ngrams=False,
        grouping=False,
        light_grouping=False,
        grouping_map_dir=None,
    ):
        """
        Load the np2vec model for set expansion.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.
            grouping (bool): boolean indicating whether to load NP grouping maps at all.
            light_grouping (bool): boolean indicating whether to load all maps for grouping.
            grouping_map_dir (str): path to the directory containing maps for grouping.
            Defaults to the directory of np2vec_model_file.
        """
        self.grouping = grouping
        if grouping:
            # load grouping info
            logger.info("loading grouping data")
            if not grouping_map_dir:
                # Fall back to the maps stored next to the model file.
                grouping_map_dir = path.dirname(np2vec_model_file)
            self.np2id = load_json_file(path.join(grouping_map_dir, "np2id"))
            if not light_grouping:
                # Full grouping also needs the reverse/representative maps.
                self.id2rep = load_json_file(path.join(grouping_map_dir, "id2rep"))
                self.id2group = load_json_file(path.join(grouping_map_dir, "id2group"))
        # Fixed typo in log message: was 'loadind model...'.
        logger.info("loading model...")
        self.np2vec_model = NP2vec.load(np2vec_model_file, binary=binary, word_ngrams=word_ngrams)
        # Every vocabulary term ends with a marking character; take it from the
        # first entry (assumes a non-empty vocab — TODO confirm).
        first_term = next(iter(self.np2vec_model.vocab.keys()))
        self.mark_char = first_term[-1]
        # Precompute L2-normalized vectors for fast similarity queries.
        # (Log line moved here so it sits next to the work it describes.)
        logger.info("compute L2 norm")
        self.np2vec_model.init_sims()
        logger.info("done init")
Пример #3
0
    # Flag selecting plain word vectors (0) vs fasttext subword vectors (1).
    word_ngrams_help = (
        'If 0, the model to load stores word information. If 1, the model to load stores '
        'subword (ngrams) information; note that subword information is relevant only to '
        'fasttext models.')
    arg_parser.add_argument('--word_ngrams', default=0, type=int, choices=[0, 1],
                            help=word_ngrams_help)
    # Character used both as the word separator inside an NP and as its suffix.
    arg_parser.add_argument('--mark_char', default='_', type=str,
                            action=check_size(1, 2),
                            help='special character that marks word separator and NP suffix.')
    # The noun phrase whose vector will be printed.
    arg_parser.add_argument('--np', default='Intel Corp.', type=str,
                            action=check_size(min_size=1), required=True,
                            help='NP to print its word vector.')

    args = arg_parser.parse_args()

    # Load the trained NP2vec model from disk.
    np2vec_model = NP2vec.load(args.np2vec_model_file, binary=args.binary,
                               word_ngrams=args.word_ngrams)

    # Model vocabulary keys are the NP tokens joined by the marking character,
    # with the marking character appended as a suffix.
    model_key = args.mark_char.join(args.np.split()) + args.mark_char
    print("word vector for the NP \'" + args.np + "\':", np2vec_model[model_key])
Пример #4
0
        action='store_true')
    # Flag selecting plain word vectors (0) vs fasttext subword vectors (1).
    word_ngrams_help = (
        'If 0, the model to load stores word information. If 1, the model to load stores '
        'subword (ngrams) information; note that subword information is relevant only to '
        'fasttext models.')
    arg_parser.add_argument('--word_ngrams', default=0, type=int, choices=[0, 1],
                            help=word_ngrams_help)
    # Character used both as the word separator inside an NP and as its suffix.
    arg_parser.add_argument('--mark_char', default='_', type=str,
                            action=check_size(1, 2),
                            help='special character that marks word separator and NP suffix.')
    # The noun phrase whose vector will be printed.
    # NOTE(review): keyword here is check_size(min=1) while the sibling example
    # uses check_size(min_size=1) — verify against the check_size definition.
    arg_parser.add_argument('--np', default='Intel Corp.', type=str,
                            action=check_size(min=1),
                            help='NP to print its word vector.')

    args = arg_parser.parse_args()

    # Load the trained NP2vec model from disk.
    np2vec_model = NP2vec.load(args.np2vec_model_file, binary=args.binary,
                               word_ngrams=args.word_ngrams)

    # Model vocabulary keys are the NP tokens joined by the marking character,
    # with the marking character appended as a suffix.
    model_key = args.mark_char.join(args.np.split()) + args.mark_char
    print("word vector for the NP \'" + args.np + "\':", np2vec_model[model_key])