def __init__(self, np2vec_model_file, binary=False, word_ngrams=False, grouping=False):
    """
    Load the np2vec model for set expansion.

    Args:
        np2vec_model_file (str): the file containing the np2vec model to load
        binary (bool): boolean indicating whether the np2vec model to load is in binary
            format
        word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.
        grouping (bool): if True, the 'id2rep', 'np2id' and 'id2group' maps are loaded
            with ``load_json_file`` from ``cur_dir`` and stored on the instance. If False,
            those attributes are not assigned.

    Raises:
        ValueError: if the marking character cannot be determined because the loaded
            model has an empty vocabulary or its first term is an empty string.
    """
    self.grouping = grouping
    if grouping:
        # load grouping info
        logger.info('loading grouping data')
        self.id2rep = load_json_file(path.join(cur_dir, 'id2rep'))
        self.np2id = load_json_file(path.join(cur_dir, 'np2id'))
        self.id2group = load_json_file(path.join(cur_dir, 'id2group'))

    logger.info('loadind model...')
    self.np2vec_model = NP2vec.load(np2vec_model_file, binary=binary, word_ngrams=word_ngrams)

    # extract the first term of the model in order to get the marking character.
    # A default is passed to next() so that an empty vocabulary does not leak a bare
    # StopIteration out of the constructor.
    first_term = next(iter(self.np2vec_model.vocab.keys()), None)
    if not first_term:
        raise ValueError(
            'cannot determine the marking character: the vocabulary of the model loaded '
            'from {!r} is empty or starts with an empty term'.format(np2vec_model_file))
    # The marking character is taken to be the last character of a vocabulary term.
    self.mark_char = first_term[-1]

    # Precompute L2-normalized vectors.
    logger.info('compute L2 norm')
    self.np2vec_model.init_sims()
    logger.info('done init')
def __init__(
    self,
    np2vec_model_file,
    binary=False,
    word_ngrams=False,
    grouping=False,
    light_grouping=False,
    grouping_map_dir=None,
):
    """
    Load the np2vec model for set expansion.

    Args:
        np2vec_model_file (str): the file containing the np2vec model to load
        binary (bool): boolean indicating whether the np2vec model to load is in binary
            format
        word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.
        grouping (bool): if True, grouping maps are loaded with ``load_json_file`` and
            stored on the instance. If False, no grouping map attribute is assigned.
        light_grouping (bool): only consulted when ``grouping`` is True. If True, only
            the 'np2id' map is loaded; 'id2rep' and 'id2group' are skipped and the
            corresponding attributes are not assigned.
        grouping_map_dir (str): path to the directory containing maps for grouping. When
            falsy, the directory of ``np2vec_model_file`` is used.

    Raises:
        ValueError: if the marking character cannot be determined because the loaded
            model has an empty vocabulary or its first term is an empty string.
    """
    self.grouping = grouping
    if grouping:
        # load grouping info
        logger.info("loading grouping data")
        if not grouping_map_dir:
            # Fall back to the directory that holds the model file.
            grouping_map_dir = path.dirname(np2vec_model_file)
        self.np2id = load_json_file(path.join(grouping_map_dir, "np2id"))
        if not light_grouping:
            self.id2rep = load_json_file(path.join(grouping_map_dir, "id2rep"))
            self.id2group = load_json_file(path.join(grouping_map_dir, "id2group"))

    logger.info("loadind model...")
    self.np2vec_model = NP2vec.load(np2vec_model_file, binary=binary, word_ngrams=word_ngrams)

    # extract the first term of the model in order to get the marking character.
    # A default is passed to next() so that an empty vocabulary does not leak a bare
    # StopIteration out of the constructor.
    first_term = next(iter(self.np2vec_model.vocab.keys()), None)
    if not first_term:
        raise ValueError(
            "cannot determine the marking character: the vocabulary of the model loaded "
            "from {!r} is empty or starts with an empty term".format(np2vec_model_file)
        )
    # The marking character is taken to be the last character of a vocabulary term.
    self.mark_char = first_term[-1]

    # Precompute L2-normalized vectors.
    logger.info("compute L2 norm")
    self.np2vec_model.init_sims()
    logger.info("done init")
# Remaining command-line options for the "print the vector of one NP" script.
# `arg_parser` is created before this point.
word_ngrams_help = (
    'If 0, the model to load stores word information. If 1, the model to load stores '
    'subword (ngrams) information; note that subword information is relevant only to '
    'fasttext models.'
)
arg_parser.add_argument(
    '--word_ngrams',
    type=int,
    choices=[0, 1],
    default=0,
    help=word_ngrams_help,
)

# The marking character joins the words of an NP and is also appended as a suffix.
arg_parser.add_argument(
    '--mark_char',
    type=str,
    default='_',
    action=check_size(1, 2),
    help='special character that marks word separator and NP suffix.',
)

# NOTE(review): `required=True` makes the default below unreachable -- confirm whether
# the option is meant to be mandatory or to fall back to 'Intel Corp.'.
arg_parser.add_argument(
    '--np',
    type=str,
    default='Intel Corp.',
    action=check_size(min_size=1),
    required=True,
    help='NP to print its word vector.',
)

args = arg_parser.parse_args()

np2vec_model = NP2vec.load(
    args.np2vec_model_file,
    binary=args.binary,
    word_ngrams=args.word_ngrams,
)

# Build the model key: whitespace-separated words joined by the marking character,
# followed by one trailing marking character.
np_header = "word vector for the NP \'" + args.np + "\':"
np_words = args.np.split()
np_key = args.mark_char.join(np_words) + args.mark_char
print(np_header, np2vec_model[np_key])
action='store_true') arg_parser.add_argument( '--word_ngrams', default=0, type=int, choices=[0, 1], help='If 0, the model to load stores word information. If 1, the model to load stores ' 'subword (ngrams) information; note that subword information is relevant only to ' 'fasttext models.') arg_parser.add_argument( '--mark_char', default='_', type=str, action=check_size(1, 2), help='special character that marks word separator and NP suffix.') arg_parser.add_argument( '--np', default='Intel Corp.', type=str, action=check_size(min=1), help='NP to print its word vector.') args = arg_parser.parse_args() np2vec_model = NP2vec.load( args.np2vec_model_file, binary=args.binary, word_ngrams=args.word_ngrams) print("word vector for the NP \'" + args.np + "\':", np2vec_model[args.mark_char.join( args.np.split()) + args.mark_char])