Example #1
def get_words2vecs(doc, model, dim, stop_list, is_normalization_word=False, fix_stem=None):
  # flatten doc: doc[sentence_id][word_id] -> doc[word_id]
  if len(doc) > 0 and isinstance(doc[0], list):
    doc = sum(doc, [])

  if fix_stem:
    (refined_oov, stem_map, oov_map) = fix_stem
    doc_oov_refined = []
    for word in doc:
      if word not in model and word not in refined_oov:
        continue
      doc_oov_refined += (re.split(r'\s+', refined_oov[word]) if (word in refined_oov and bool(refined_oov[word])) else [word])

    doc = []
    for word in doc_oov_refined:
      stemmed_word = (' '.join(my_util.preprocess_content(word, stop_list, is_math=True, is_stemming=True))).strip()
      if stemmed_word:
        if stemmed_word in stem_map:
          doc.append(stem_map[stemmed_word])
        elif stemmed_word in oov_map:
          doc.append(oov_map[stemmed_word])

  arr = np.empty((0, (dim + len(oov_map) if fix_stem else dim)))
  for word in doc:
    if word in stop_list or (not fix_stem and word not in model):
      continue
    if not fix_stem:
      arr = np.vstack(( arr, normalize_sum(model[word], is_normalization_word) ))
    elif isinstance(word, int):  # OOV words carry an integer one-hot index
      word_arr = np.zeros( dim + len(oov_map) )
      word_arr[word] = 1.0
      arr = np.vstack((arr, word_arr))
    else:
      # consider normalize sum(model[word]) to 1
      arr = np.vstack((
        arr,
        np.hstack((
          np.zeros( len(oov_map) ),
          normalize_sum(model[word], is_normalization_word)
        ))
      ))
  return arr
def get_words2vecs(doc, model, dim, stop_list, is_normalization_word=False, fix_stem=None):
    # flatten doc: doc[sentence_id][word_id] -> doc[word_id]
    if len(doc) > 0 and isinstance(doc[0], list):
        doc = sum(doc, [])

    if fix_stem:
        (refined_oov, stem_map, oov_map) = fix_stem
        doc_oov_refined = []
        for word in doc:
            if word not in model and word not in refined_oov:
                continue
            doc_oov_refined += (
                re.split("\s+", refined_oov[word]) if (word in refined_oov and bool(refined_oov[word])) else [word]
            )

        doc = []
        for word in doc_oov_refined:
            stemmed_word = (
                " ".join(my_util.preprocess_content(word, stop_list, is_math=True, is_stemming=True))
            ).strip()
            if stemmed_word:
                if stemmed_word in stem_map:
                    doc.append(stem_map[stemmed_word])
                elif stemmed_word in oov_map:
                    doc.append(oov_map[stemmed_word])

    arr = np.empty((0, (dim + len(oov_map) if fix_stem else dim)))
    for word in doc:
        if word in stop_list or (not fix_stem and word not in model):
            continue
        if not fix_stem:
            arr = np.vstack((arr, normalize_sum(model[word], is_normalization_word)))
        elif isinstance(word, int):  # OOV words carry an integer one-hot index
            word_arr = np.zeros(dim + len(oov_map))
            word_arr[word] = 1.0
            arr = np.vstack((arr, word_arr))
        else:
            # consider normalize sum(model[word]) to 1
            arr = np.vstack(
                (arr, np.hstack((np.zeros(len(oov_map)), normalize_sum(model[word], is_normalization_word))))
            )
    return arr
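
The helper normalize_sum is not defined in this excerpt. Judging from the in-code comment ("consider normalize sum(model[word]) to 1"), a minimal sketch might look like the following; the body is an assumption, and the original may normalize differently (for example by L2 norm):

import numpy as np

def normalize_sum(vec, is_normalization_word=False):
    # Hypothetical sketch: when the flag is set, scale the vector so its
    # components sum to 1; otherwise pass the embedding through unchanged.
    if not is_normalization_word:
        return vec
    total = np.sum(vec)
    return vec / total if total != 0 else vec

A hypothetical call, assuming a loaded gensim word2vec model with 300-dimensional vectors:

doc = [["gradient", "descent"], ["converges", "quickly"]]  # doc[sentence_id][word_id]
vecs = get_words2vecs(doc, model, 300, stop_list=[])  # model: assumed already loaded
print(vecs.shape)  # (number of kept in-vocabulary words, 300) when fix_stem is None
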
Example #3
def _main( ):
  argv_index = 1
  task_type, argv_index = assign_argv(sys.argv, argv_index) # 1
  corpus_name, argv_index = assign_argv(sys.argv, argv_index) # 2
  model_name, argv_index = assign_argv(sys.argv, argv_index) # 3
  lecture_list_name, argv_index = assign_argv(sys.argv, argv_index) # 4
  fea_dir, argv_index = assign_argv(sys.argv, argv_index) # 5
  dim, argv_index = assign_argv(sys.argv, argv_index, int) # 6

  is_normalization_doc2vec, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x))) # 7
  is_normalization_cos_sim, argv_index = assign_argv(sys.argv, argv_index) # 8
  is_remove_stop, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x))) # 9
  corpus_pos_name, argv_index = assign_argv(sys.argv, argv_index) # 10
  pos_type, argv_index = assign_argv(sys.argv, argv_index) # 11

  use_external_model = False
  fix_stem = False
  fix_math = False
  keyword_only = False
  is_normalization_word = False
  fix_stem_info = None
  is_pool = False
  if 'pool' in task_type:
    is_pool = True

  if re.match('.*external_model', task_type):
    model = gensim.models.Word2Vec.load_word2vec_format(model_name, binary=True)  # pre-1.0 gensim API; newer releases use KeyedVectors.load_word2vec_format
    use_external_model = True
    if 'fix_stem' in task_type:
      fix_stem = True

      #if 'pool' in task_type:
      #  is_pool = True

      if 'fix_stem_math' in task_type:
        fix_math = True
      elif 'fix_stem_keyword' in task_type:
        keyword_only = True
  else:
    model = gensim.models.Word2Vec.load(model_name)

  stop_list = []
  if is_remove_stop or fix_stem:
    stop_list = load_courseware.load_stop_list()

  if re.match('.*textbook_lecture', task_type):
    tx_list_name, argv_index = assign_argv(sys.argv, argv_index) # 12
    seg_dir, argv_index = assign_argv(sys.argv, argv_index) # 13
  elif re.match('.*slides_trans', task_type):
    window_size, argv_index = assign_argv(sys.argv, argv_index, int) # 12

  if re.match('.*textbook_lecture', task_type) or re.match('.*slides_trans', task_type):
    if use_external_model:
      if fix_stem:
        is_normalization_word, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x))) # textbook=14/slides=13
        oov_refined, argv_index = assign_argv(sys.argv, argv_index) # t=15/s=14
        stem_to_raw, argv_index = assign_argv(sys.argv, argv_index) # t=16/s=15

        if fix_math:
          math_words, argv_index = assign_argv(sys.argv, argv_index) # t=17/s=16 ? it depends
        elif keyword_only:
          keyword_list, argv_index = assign_argv(sys.argv, argv_index) # t=17/s=16 ? it depends
      else:
        oov_dict_name, argv_index = assign_argv(sys.argv, argv_index) # t=14/s=13

  if use_external_model:
    if fix_stem:
      refined_oov = {}
      keywords = []
      if keyword_only:
        for line in open(keyword_list):
          keywords += my_util.preprocess_content(line.strip(), stop_list, is_math=True, is_stemming=True)
      keywords = list(set(keywords))

      for line in open(oov_refined):
        items = line.strip().split('\t')
        refined_oov[items[0]] = (items[1] if len(items) > 1 else '')

      stem_map = {}
      oov_map = {}
      oov_size = 0
      for line in open(stem_to_raw):
        items = line.strip().split('\t')
        # remove words not in positive list (keyword list)
        if keywords and items[0] not in keywords:
          continue
        if re.match('n', items[2]):
          stem_map[items[0]] = re.split(r'\s+', items[1])[0]
        else:
          oov_map[items[0]] = oov_size
          oov_size += 1
      # add math words to oov
      if fix_math:
        for line in open(math_words):
          stemmed_word = (' '.join(my_util.preprocess_content(line.strip(), stop_list, is_math=True, is_stemming=True))).strip()
          # remove words not in positive list (keyword list)
          if stemmed_word and keywords and stemmed_word not in keywords:
            continue
          if stemmed_word and stemmed_word not in stem_map and stemmed_word not in oov_map:
            oov_map[stemmed_word] = oov_size
            oov_size += 1
      fix_stem_info = (refined_oov, stem_map, oov_map)
    else:
      oov_dict = my_util.load_stopwords(oov_dict_name)

  temp_fea_dir = fea_dir
  for pool_param in gen_pool_param(is_pool):
    if is_pool:
      fea_dir = temp_fea_dir + '_%s_%d_%d' % pool_param
      if not os.path.exists(fea_dir):
        os.makedirs(fea_dir)

    corpus = load_corpus(corpus_name, pool_param=pool_param)
    corpus_pos = load_corpus(corpus_pos_name, pool_param=pool_param)

    for l_id in open(lecture_list_name):
      l_id = l_id.strip()
      trans_temp = get_docs(trans_doc_prefix, l_id, corpus, corpus_pos, pos_type)
      if re.match('.*slides_trans', task_type):
        trans = windowing(trans_temp, window_size)
        targets = get_docs(slides_doc_prefix, l_id, corpus, corpus_pos, pos_type)
      elif re.match('.*textbook_lecture', task_type):
        trans = window_by_seg(trans_temp, '%s/%s' % (seg_dir, l_id), pool_param=pool_param)
        targets = get_tx_docs(corpus, corpus_pos, pos_type, tx_list_name)

      print_fea('%s/%s' % (fea_dir, l_id), trans, targets, model, dim, stop_list, is_normalization_doc2vec, is_normalization_cos_sim, is_normalization_word=is_normalization_word, fix_stem=fix_stem_info, pool_param=pool_param)
    
      if use_external_model and not fix_stem:
        print_fea('%s/%s.oov' % (fea_dir, l_id), trans, targets, model, dim, stop_list, is_normalization_doc2vec, is_normalization_cos_sim, oov_dict=oov_dict)
def _main():
    argv_index = 1
    task_type, argv_index = assign_argv(sys.argv, argv_index)  # 1
    corpus_name, argv_index = assign_argv(sys.argv, argv_index)  # 2
    model_name, argv_index = assign_argv(sys.argv, argv_index)  # 3
    lecture_list_name, argv_index = assign_argv(sys.argv, argv_index)  # 4
    fea_dir, argv_index = assign_argv(sys.argv, argv_index)  # 5
    dim, argv_index = assign_argv(sys.argv, argv_index, int)  # 6

    is_normalization_doc2vec, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x)))  # 7
    is_normalization_cos_sim, argv_index = assign_argv(sys.argv, argv_index)  # 8
    is_remove_stop, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x)))  # 9
    corpus_pos_name, argv_index = assign_argv(sys.argv, argv_index)  # 10
    pos_type, argv_index = assign_argv(sys.argv, argv_index)  # 11

    use_external_model = False
    fix_stem = False
    fix_math = False
    keyword_only = False
    is_normalization_word = False
    fix_stem_info = None
    is_pool = False
    if "pool" in task_type:
        is_pool = True

    if re.match(".*external_model", task_type):
        model = gensim.models.Word2Vec.load_word2vec_format(model_name, binary=True)  # pre-1.0 gensim API; newer releases use KeyedVectors.load_word2vec_format
        use_external_model = True
        if "fix_stem" in task_type:
            fix_stem = True

            # if 'pool' in task_type:
            #  is_pool = True

            if "fix_stem_math" in task_type:
                fix_math = True
            elif "fix_stem_keyword" in task_type:
                keyword_only = True
    else:
        model = gensim.models.Word2Vec.load(model_name)

    stop_list = []
    if is_remove_stop or fix_stem:
        stop_list = load_courseware.load_stop_list()

    if re.match(".*textbook_lecture", task_type):
        tx_list_name, argv_index = assign_argv(sys.argv, argv_index)  # 12
        seg_dir, argv_index = assign_argv(sys.argv, argv_index)  # 13
    elif re.match(".*slides_trans", task_type):
        window_size, argv_index = assign_argv(sys.argv, argv_index, int)  # 12

    if re.match(".*textbook_lecture", task_type) or re.match(".*slides_trans", task_type):
        if use_external_model:
            if fix_stem:
                is_normalization_word, argv_index = assign_argv(
                    sys.argv, argv_index, lambda x: bool(int(x))
                )  # textbook=14/slides=13
                oov_refined, argv_index = assign_argv(sys.argv, argv_index)  # t=15/s=14
                stem_to_raw, argv_index = assign_argv(sys.argv, argv_index)  # t=16/s=15

                if fix_math:
                    math_words, argv_index = assign_argv(sys.argv, argv_index)  # t=17/s=16 ? it depends
                elif keyword_only:
                    keyword_list, argv_index = assign_argv(sys.argv, argv_index)  # t=17/s=16 ? it depends
            else:
                oov_dict_name, argv_index = assign_argv(sys.argv, argv_index)  # t=14/s=13

    if use_external_model:
        if fix_stem:
            refined_oov = {}
            keywords = []
            if keyword_only:
                for line in open(keyword_list):
                    keywords += my_util.preprocess_content(line.strip(), stop_list, is_math=True, is_stemming=True)
            keywords = list(set(keywords))

            for line in open(oov_refined):
                items = line.strip().split("\t")
                refined_oov[items[0]] = items[1] if len(items) > 1 else ""

            stem_map = {}
            oov_map = {}
            oov_size = 0
            for line in open(stem_to_raw):
                items = line.strip().split("\t")
                # remove words not in positive list (keyword list)
                if keywords and items[0] not in keywords:
                    continue
                if re.match("n", items[2]):
                    stem_map[items[0]] = re.split(r"\s+", items[1])[0]
                else:
                    oov_map[items[0]] = oov_size
                    oov_size += 1
            # add math words to oov
            if fix_math:
                for line in open(math_words):
                    stemmed_word = (
                        " ".join(my_util.preprocess_content(line.strip(), stop_list, is_math=True, is_stemming=True))
                    ).strip()
                    # remove words not in positive list (keyword list)
                    if stemmed_word and keywords and stemmed_word not in keywords:
                        continue
                    if stemmed_word and stemmed_word not in stem_map and stemmed_word not in oov_map:
                        oov_map[stemmed_word] = oov_size
                        oov_size += 1
            fix_stem_info = (refined_oov, stem_map, oov_map)
        else:
            oov_dict = my_util.load_stopwords(oov_dict_name)

    temp_fea_dir = fea_dir
    for pool_param in gen_pool_param(is_pool):
        if is_pool:
            fea_dir = temp_fea_dir + "_%s_%d_%d" % pool_param
            if not os.path.exists(fea_dir):
                os.makedirs(fea_dir)

        corpus = load_corpus(corpus_name, pool_param=pool_param)
        corpus_pos = load_corpus(corpus_pos_name, pool_param=pool_param)

        for l_id in open(lecture_list_name):
            l_id = l_id.strip()
            trans_temp = get_docs(trans_doc_prefix, l_id, corpus, corpus_pos, pos_type)
            if re.match(".*slides_trans", task_type):
                trans = windowing(trans_temp, window_size)
                targets = get_docs(slides_doc_prefix, l_id, corpus, corpus_pos, pos_type)
            elif re.match(".*textbook_lecture", task_type):
                trans = window_by_seg(trans_temp, "%s/%s" % (seg_dir, l_id), pool_param=pool_param)
                targets = get_tx_docs(corpus, corpus_pos, pos_type, tx_list_name)

            print_fea(
                "%s/%s" % (fea_dir, l_id),
                trans,
                targets,
                model,
                dim,
                stop_list,
                is_normalization_doc2vec,
                is_normalization_cos_sim,
                is_normalization_word=is_normalization_word,
                fix_stem=fix_stem_info,
                pool_param=pool_param,
            )

            if use_external_model and not fix_stem:
                print_fea(
                    "%s/%s.oov" % (fea_dir, l_id),
                    trans,
                    targets,
                    model,
                    dim,
                    stop_list,
                    is_normalization_doc2vec,
                    is_normalization_cos_sim,
                    oov_dict=oov_dict,
                )
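
The helper assign_argv is likewise not defined in this excerpt. From the call sites it appears to read one positional argument, apply an optional converter, and return the value together with the advanced index so that assignments can be chained; a minimal sketch under that assumption:

def assign_argv(argv, index, convert=str):
    # Hypothetical sketch: pull argv[index], convert it (str by default),
    # and return the next index, matching the chained call pattern
    # `task_type, argv_index = assign_argv(sys.argv, argv_index)` above.
    return convert(argv[index]), index + 1
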