Example #1
def get_training_data_for_sense_cache(sense, lang):
  # Skip senses with a slash in the title (it would break the output file
  # name below) and a few hard-coded problem titles.
  if '/' in sense or sense == "Bomb the Bass" or is_multi_cap(sense) or sense == "Kirklees" or sense == "Fallout 2":
    return []
  file_name = get_joined_name(['sense', lang, sense])
  sense_data = get_from_cache(file_name)
  if sense_data:
    debug("# Getting (cached) sense training data for " + sense)
    return sense_data

  debug("Getting sense training data for " + sense)
  
  # whatLinksHere.py is expected to write the pages that link to this sense
  # to output-<sense>-<lang>.xml, which is parsed below.
  output_file_name = 'output-' + sense + '-' + lang + '.xml'
  subprocess.check_output(['python', 'whatLinksHere.py',
    sense, lang, str(NUM_WHATLINKS_PER_WORD)])
  
  sense_link_re = get_sense_link_re(sense)
  pos_entries_by_paragraph = []
  if os.path.exists(output_file_name):
    file_size = os.path.getsize(output_file_name)
    debug(file_size)
    pages = parse_wiki_xml.parse_articles_xml(output_file_name)
    ignored, pos_entries_by_paragraph = \
      get_annotated_paragraphs_in_pages(pages, lang, sense_link_re) 
  sense_data = pos_entries_by_paragraph
  if not is_likely_lower_sense(sense_data):
    sense_data = []
  insert_into_cache(file_name, sense_data)
  remove_if_exists(output_file_name)
  return sense_data
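
The example above relies on cache helpers (get_joined_name, get_from_cache, insert_into_cache) and remove_if_exists, which are defined elsewhere in the module and not shown. A minimal sketch of what they might look like, assuming a pickle-based cache directory (the 'cache' path and the .pkl format are assumptions, not part of the original code):

import os
import pickle

CACHE_DIR = 'cache'  # assumed cache location

def get_joined_name(parts):
  # Build a flat cache key such as 'sense-en-Mercury'.
  return '-'.join(parts)

def get_from_cache(file_name):
  # Return the cached object, or None when nothing has been cached yet.
  path = os.path.join(CACHE_DIR, file_name + '.pkl')
  if not os.path.exists(path):
    return None
  with open(path, 'rb') as f:
    return pickle.load(f)

def insert_into_cache(file_name, data):
  if not os.path.isdir(CACHE_DIR):
    os.makedirs(CACHE_DIR)
  with open(os.path.join(CACHE_DIR, file_name + '.pkl'), 'wb') as f:
    pickle.dump(data, f)

def remove_if_exists(path):
  # Delete the temporary XML written by whatLinksHere.py, if present.
  if os.path.exists(path):
    os.remove(path)
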
Example #2
def getStopWords(LANGUAGE_CODE, NUM_ARTICLES, NUM_WORDS):
  # Fetch random page ids from the MediaWiki random API, ten at a time,
  # until NUM_ARTICLES pages have been scraped.

  num_pages_scraped = 0
  allText = ""
  while (num_pages_scraped < NUM_ARTICLES):
    randUrl = "http://" + LANGUAGE_CODE + ".wikipedia.org/w/api.php?action=query&list=random&format=xml&rnlimit=10"
    pageids = []
    for pageid in parse_wiki_xml.parse_random_articles_xml(urllib.urlopen(randUrl)):
      pageids.append(str(pageid))

    num_pages_scraped += len(pageids)

    pagesStr = "|".join(pageids)

    pagesStr = pagesStr[0:-1] # get rid of last pipe

    url = 'http://' + LANGUAGE_CODE + '.wikipedia.org/w/api.php?action=query&prop=revisions&redirects&rvprop=content&format=xml&pageids=' + pagesStr
    print "Scraping from url: " + url

    # Fetch the revision content of every page id in a single request.
    articles = parse_wiki_xml.parse_articles_xml(urllib.urlopen(url))
    for article in articles:
      if 'content' in article and article['content']:
        allText += article['content']

  print "Length of all articles scraped: " + str(len(allText))
  # Count word frequencies over the concatenated article text; the most
  # frequent words become the stop-word list.
  words = re.findall(r'\w+', allText.lower())
  mostFrequent = collections.Counter(words).most_common(NUM_WORDS)
  print mostFrequent
  output = ""
  for t in mostFrequent:
    output += t[0] + "," + str(t[1]) + "\n"
  return output
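
getStopWords returns the stop-word list as CSV-formatted text rather than writing it anywhere. A possible driver, with illustrative argument values and the module-level imports the function depends on (the output file name below is an example, not from the original project):

import collections
import re
import urllib

import parse_wiki_xml  # project-local XML helper used throughout these examples

# Scrape roughly 50 random English articles and keep the 200 most frequent words.
csv_text = getStopWords('en', 50, 200)
with open('stopwords-en.csv', 'w') as f:
  f.write(csv_text)
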
Example #3
def wsd(xml_file_name, lang):
  global wsd_output_file
  stop_words = get_stop_words(lang)
  for page in parse_wiki_xml.parse_articles_xml(xml_file_name):
    # One output file per page, under the '<lang>2' directory.
    output_file_base = os.path.join(lang + '2', page['pageid'] + '-' + lang)
    output_file_tmp = output_file_base + '.tmp'
    output_file_name = output_file_base + '.txt'
    debug("Disambiguating " + page['title'])
    # Skip pages that already have finished output from an earlier run.
    if os.path.exists(output_file_name):
      continue
    # Write to a .tmp file and rename it on success, so an interrupted run
    # never leaves a partial .txt behind.
    wsd_output_file = open(output_file_tmp, 'w')
    wsd_page(page['pageid'], page['title'], page['content'], lang, stop_words)
    wsd_output_file.close()
    wsd_output_file = None
    os.rename(output_file_tmp, output_file_name)
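
wsd expects the per-language output directory ('<lang>2', e.g. 'en2') to already exist, and relies on get_stop_words and wsd_page defined elsewhere in the module, writing through the module-level wsd_output_file handle. A minimal driver under those assumptions (the dump file name is illustrative):

import os

lang = 'en'
dump_file = 'articles-en.xml'  # assumed name of a pre-fetched article dump
if not os.path.isdir(lang + '2'):
  os.makedirs(lang + '2')  # wsd() writes one '<pageid>-<lang>.txt' per page here
wsd(dump_file, lang)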