def beginProcess(info, identifier, gdbm_files, filter_file, path, category):
    """Dispatch ``info`` to the processor selected by ``identifier``.

    ``"w"`` builds a ``url.url`` object and calls ``processHTML``;
    ``"f"`` and ``"h"`` build a ``textfile.textfile`` object and call
    ``processUTFFile`` or ``processHTMLFile`` respectively. Any other
    identifier is logged at debug level and ignored.

    Args:
        info: value handed to the selected processing method.
        identifier: one of ``"w"``, ``"f"`` or ``"h"``.
        gdbm_files, filter_file, path, category: forwarded unchanged to
            the processor's constructor.
    """
    logger = logging.getLogger('classify')
    logger.debug("classify.beginProcess()")

    # Reject unknown identifiers up front so no processor is constructed.
    if identifier not in ("w", "f", "h"):
        logger.debug("identifier value is not valid")
        return

    if identifier == "w":
        url.url(gdbm_files, filter_file, path, category).processHTML(info)
    else:
        # Both file-based modes share the same constructor call.
        handler = textfile.textfile(gdbm_files, filter_file, path, category)
        if identifier == "f":
            handler.processUTFFile(info)
        else:
            handler.processHTMLFile(info)

    logger.debug("program terminated")
Пример #2
0
 def import_Tonly(self, file):
     """Load a plain-text file as the current document.

     The annotation containers (``all_spans``, ``all_rels``,
     ``all_kstructs``) are reset, the file is read as UTF-8, and a new
     ``document`` identified by ``str(file)`` is built and parsed.

     Nothing happens when ``file`` is ``None``, ``''`` or ``()``.
     NOTE(review): those look like the "cancelled" results of a file
     dialog -- confirm against the caller.

     Args:
         file: path of the text file to import.
     """
     # Identity test for None (PEP 8); the other two are value comparisons.
     if file is None or file == '' or file == ():
         return

     self.all_spans = namelistmap()
     self.all_rels = {}
     self.all_kstructs = {}

     # Only the read needs the handle; release it before building and
     # parsing the document.
     with open(file, mode='r', encoding='utf-8') as f:
         file_content = f.read()

     self.doc = document(ident=str(file), content=file_content)
     self.data = self.doc.ctnt
     self.curr_file = file
     self.data_parse = textfile(in_text=self.data,
                                configuration=self.config)
Пример #3
0
 def import_TandA(self, file):
     """Load a serialized document (text and annotations) from ``file``.

     The annotation containers (``all_spans``, ``all_rels``,
     ``all_kstructs``) are reset first. The file is read as UTF-8 and
     its content must start with a ``document(...)`` expression
     (matched case-insensitively). If it does not, an error message is
     printed and no document is loaded.

     Args:
         file: path of the file to import.
     """
     self.all_spans = namelistmap()
     self.all_rels = {}
     self.all_kstructs = {}

     # Only the read needs the handle; release it before evaluating.
     with open(file, mode='r', encoding='utf-8') as f:
         file_content = f.read()

     # Raw string: '\(' in a plain literal is an invalid escape sequence
     # and triggers a warning on recent Python versions.
     doc_reg = re.compile(r'document\(.*\)', re.DOTALL | re.IGNORECASE)
     if not doc_reg.match(file_content):
         print('ERROR tentative to import a text only file!')
         return

     # SECURITY: eval() executes whatever the file contains; the regex
     # above is only a shape check, not a sandbox. Import trusted files
     # only, or replace this with a dedicated parser.
     self.doc = eval(file_content)
     self.data = self.doc.ctnt
     self.curr_file = file
     self.data_parse = textfile(in_text=self.data,
                                configuration=self.config)
Пример #4
0
    def __init__(self, data=None, config_path=None):
        """Set up the annotator, optionally from a configuration file.

        Args:
            data: a ``document`` instance, a string holding the text, or
                ``None`` for an empty document.
            config_path: path of a file whose content evaluates to the
                configuration dictionary, or ``None``.

        Raises:
            AssertionError: if ``config_path`` is not a path to an
                existing file, or ``data`` has an unsupported type.
        """
        # A dictionary holding attribute value associations for configuring
        # some annotation application, e.g. the treetagger:
        # { 'TREEAGGER_DIR':'/people/koroleva/Desktop/src/TreeTagger'}
        self.config = {}

        if config_path is None:
            # NOTE(review): without a config file nothing besides
            # ``self.config`` is initialised (no name_mgr, doc, data_parse).
            # This preserves the existing behaviour but looks like an
            # indentation slip -- confirm before relying on it.
            return

        # The previous check, ``type(config_path is str)``, was always truthy,
        # so the type was never validated. Path-like objects stay accepted.
        assert (isinstance(config_path, (str, bytes, os.PathLike))
                and os.path.isfile(config_path))
        # SECURITY: eval() executes whatever the configuration file contains;
        # only point this at trusted files.
        with open(config_path, 'r') as f:
            self.config = eval(f.read())

        # Given to namespace, the list of various type names used to build
        # object names (unique identifiers).
        Annotate.SPAN_TYP = 'SPAN'
        Annotate.MWU_TYP = 'MWU'
        Annotate.SRC_TYP = 'SRC'
        Annotate.TRGT_TYP = 'TRGT'
        Annotate.REL_TYP = 'REL'
        Annotate.CONSTRU_TYP = 'CONSTRU'
        Annotate.ALL_TYPS = [
            Annotate.SPAN_TYP, Annotate.MWU_TYP, Annotate.SRC_TYP,
            Annotate.TRGT_TYP, Annotate.REL_TYP, Annotate.CONSTRU_TYP
        ]
        self.name_mgr = namespace(Annotate.ALL_TYPS)

        self.curr_file = None

        self.data = ''
        self.wn = 0

        if isinstance(data, document):
            self.doc = data
            self.data = self.doc.ctnt
        elif isinstance(data, str):
            self.doc = document(ident='', content=data, metadata='')
            self.data = data
        else:
            assert (data is None)
            self.doc = document(ident='', content='', metadata='')
            # NOTE(review): this leaves self.data as None although the
            # document content is ''; kept as-is -- confirm it is intended.
            self.data = data

        self.sentences = []

        self.data_parse = textfile(in_text=self.data,
                                   configuration=self.config)
Пример #5
0
  commonscore = compdict(text1, text2)

  return commonscore + modescore + lenscore + lensenscore

def compdict(text1, text2):
  """Score the vocabulary overlap of two texts.

  Every word present in both texts contributes 50 points plus
  ``10 - d*d``, where ``d`` is the difference between the word's values
  in ``text1.words`` and ``text2.words``.

  Args:
    text1, text2: objects exposing ``sortedwords`` (an iterable of
      sequences whose first item is the word) and ``words`` (a mapping
      from word to a number).
      NOTE(review): assumes every word listed in ``sortedwords`` is
      also a key of ``words`` -- confirm against the textfile class.

  Returns:
    The overlap score as a number; 0 when no word is shared.
  """
  vocab1 = {entry[0] for entry in text1.sortedwords}
  vocab2 = {entry[0] for entry in text2.sortedwords}
  shared = vocab1 & vocab2

  total = 50 * len(shared)
  for word in shared:
    # Compare the two texts with each other; subtracting text1's value
    # from itself made the penalty term always zero.
    diff = text1.words[word] - text2.words[word]
    total += 10 - diff * diff
  return total

# Number of input files, named "1.txt" .. "<NUM_FILES>.txt".
NUM_FILES = 30

files = [textfile(str(i + 1) + ".txt") for i in xrange(NUM_FILES)]

# Greedy pairing: for each still-unlinked file j, pick the unlinked file i
# with the highest compscore, mark both as linked and record the (i, j) pair.
pairs = []

for j in xrange(NUM_FILES):
  # A file that is already linked cannot start a new pair.
  if files[j].link:
    continue

  pairlist = [
    (compscore(files[i], files[j]), i, j)
    for i in xrange(NUM_FILES)
    if i != j and not files[i].link
  ]
  # No partner left: skip explicitly instead of letting max() fail inside a
  # bare except, which would also hide genuine errors.
  if not pairlist:
    continue

  maxpair = max(pairlist, key=operator.itemgetter(0))
  files[maxpair[1]].link = True
  files[maxpair[2]].link = True
  pairs.append(maxpair[1:])