def __parse_documents(datapath):
    """ function: __parse_documents
        ---------------------------
        extract a list of Document objects from the files in datapath

        :returns: list of Document entities, one per <REUTERS> segment
    """
    documents = []
    # generate a well-formatted document set for each file
    for file in os.listdir(datapath):
        # open 'reut2-XXX.sgm' file from the /data directory
        path = os.path.join(datapath, file)
        with open(path, 'r') as data:
            text = data.read()
        tree = __generate_tree(text.lower())
        # separate segments & generate documents; reset pairs for each file
        # so documents from earlier files are not appended a second time
        pairs = {}
        for reuter in tree.find_all("reuters"):
            document = Document(reuter)
            pairs[document] = reuter
        # generate a tokenized word list for each document
        for document, reuter in pairs.items():
            document.populate_word_list(reuter)
            documents.append(document)
        print("Finished extracting information from file:", file)
    return documents
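The helper __generate_tree is not shown in this snippet; a minimal sketch, assuming it wraps the raw SGML in a BeautifulSoup tree (consistent with the find_all("reuters") call above):

from bs4 import BeautifulSoup

def __generate_tree(text):
    # Assumption: BeautifulSoup's lenient HTML parser tolerates the
    # non-XML Reuters SGML well enough to support find_all() lookups.
    return BeautifulSoup(text, "html.parser")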
Example #3
def main():
    parser = argparse.ArgumentParser(description="Creates a document's folder from a bibtex file")
    parser.add_argument('bib', help='the bibtex file of the document')
    parser.add_argument("-p", "--path", type=str, help="path of the directory in which to create the document's folder")
    args = parser.parse_args()

    bibfile = args.bib
    path = args.path

    doc = Document()
    if doc.readFields(bibfile):
        doc.createDocumentFolder(bibfile, path)
Example #4
def main():
    parser = argparse.ArgumentParser(description='Creates a LaTeX notes template from a bibtex file')
    parser.add_argument('bib', help='the bibtex file of the document')
    parser.add_argument("-o", "--path", type=str, help="path of the directory in which to create the document's folder")
    args = parser.parse_args()

    bibfile = args.bib
    path = args.path

    doc = Document()
    if doc.readFields(bibfile):
        doc.writeLatexNotes(path)
Example #5
    def parse(self, path):
        """Méthode qui analyse un fichier .txt est renvoie un corpus

        Parameters
        ----------
        path : str
            Chemin du fichier à analyser

        Raises
        ------
        FileNotFoundError
            Si le chemin vers le fichier n'existe pas
        PermissionError
            Si les permissions du fichier ne permettent pas l'ouverture
        """
        corpusRes = Corpus()
        with open(path, "r", encoding="utf-8") as file:
            txt = file.read()

        regex = r'<article title=\".*?\">\n(.*?)</article>'
        contenus = re.findall(regex, txt, re.DOTALL)

        for content in contenus:
            corpusRes.addDocument(Document(content))

        return corpusRes
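The article-extraction regex is easy to check in isolation; a small sketch with hypothetical file contents:

import re

sample = '<article title="First">\nBody of the first article.\n</article>'
pattern = r'<article title=\".*?\">\n(.*?)</article>'
# The non-greedy group captures everything up to the closing tag.
print(re.findall(pattern, sample, re.DOTALL))  # ['Body of the first article.\n']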
Example #6
    def parse(self, path):
        """Methode qui permet de construire un corpus à partir d'un .txt.
        Ici le motif "##END##" doit être une ligne entre deux documents dans le
        .txt. Si aucun motif "##END##" n'est présent alors le corpus sera composé
        d'un seul document de contenu tout le .txt.

        Parameters
        ----------
        path : str
            Chemin du fichier à analyser

        Raises
        ------
        FileNotFoundError
            Si le chemin vers le fichier n'existe pas
        PermissionError
            Si les permissions du fichier ne permettent pas l'ouverture

        Returns
        -------
        Corpus
            Le corpus extrait du fichier passé en argument
        """
        corpusRes = Corpus()

        with open(path, 'r', encoding='utf-8') as f:
            txt = f.read()

        listedoc = re.split('^##END##$', txt, flags=re.MULTILINE)

        for contenu in listedoc:
            corpusRes.addDocument(Document(contenu))

        return corpusRes
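The split behaviour can be sanity-checked in isolation; a small sketch with hypothetical contents:

import re

sample = "first document\n##END##\nsecond document"
# MULTILINE anchors ^ and $ match the marker only when it fills a whole line.
print(re.split('^##END##$', sample, flags=re.MULTILINE))
# ['first document\n', '\nsecond document']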
Example #7
def get_candidate_fragments_from_web(fragments: List[Fragment],
                                     **params) -> List[FragmentReport]:
    fragments_url_snippet_pairs = get_url_snippet_pairs(fragments, params)
    # create reports
    reports = []
    url_doc = {}
    user_login, _ = get_current_user()
    for i in range(len(fragments)):
        similar_fragments = []
        for url, snippet in fragments_url_snippet_pairs[i]:
            if url not in url_doc:
                doc = Document(id=None,
                               uri=url,
                               snippet=snippet,
                               date_added=datetime.datetime.now(),
                               user_login=user_login)
                url_doc[url] = doc
            web_doc_fragments = url_doc[url].get_fragments()
            similar_fragments.extend([(wb_fragment, 1.0)
                                      for page in web_doc_fragments
                                      for wb_fragment in page])
        reports.append(
            FragmentReport(checked_fragment=fragments[i],
                           most_similar=similar_fragments))
    return reports
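get_url_snippet_pairs is defined elsewhere; the indexing above implies its return value is index-aligned with fragments, one list of (url, snippet) pairs per fragment. A hypothetical illustration of that shape:

# Hypothetical return value of get_url_snippet_pairs(fragments, params):
fragments_url_snippet_pairs = [
    [("https://example.com/a", "snippet a")],       # for fragments[0]
    [("https://example.com/b", "snippet b"),
     ("https://example.com/c", "snippet c")],       # for fragments[1]
]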
Example #8
 def test_uploading_content_to_index(self):
     # Search for term, add similar content and check that hits increase
     doc = Document("Test project", "Test customer", "test_file")
     hits = len(docfuncs.search("test", 999999999999999999))
     docfuncs.upload_to_index(
         doc, "src/tests/test_upload_files/one_page_test_file.pdf")
     self.assertEqual(len(docfuncs.search("test", 999999999999999999)),
                      hits + 1)
Example #9
def get_all_documents_from_db():
    """

    Returns: All document objects from the database

    """
    conn = db_conn.get_database_connection()
    cur = conn.cursor()
    cur.execute("SELECT * FROM documents")
    doc_records = cur.fetchall()
    doc_list = []
    for doc in doc_records:
        document = Document(doc[1], doc[2], doc[3])
        document.set_doc_id(doc[0])
        doc_list.append(document)
    cur.close()
    return doc_list
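The positional indexing (doc[0] through doc[3]) implies a column order of id, project, customer, file. A minimal sketch of a matching table, assuming SQLite and a hypothetical schema:

import sqlite3

conn = sqlite3.connect("documents.db")
# Hypothetical schema consistent with doc[0]=id, doc[1]=project,
# doc[2]=customer, doc[3]=file as used above.
conn.execute("""
    CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        project TEXT NOT NULL,
        customer TEXT NOT NULL,
        file TEXT NOT NULL
    )
""")
conn.commit()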
Example #10
def get_document_from_db(doc_id):
    """

    Args:
        doc_id: Document id to be retreived from the database

    Returns: Document object given the id

    """
    conn = db_conn.get_database_connection()
    cur = conn.cursor()
    cur.execute("SELECT * FROM documents WHERE id=?", (doc_id, ))
    doc_query_object = cur.fetchall()[0]
    # Construct Document object with params: project, customer, file
    doc = Document(doc_query_object[1], doc_query_object[2],
                   doc_query_object[3])
    doc.set_doc_id(doc_query_object[0])
    cur.close()
    return doc
Example #11
 def test_upload_metadata_to_db(self):
     # Test should return id 3 when a new document is inserted
     # Afterwards, the inserted row is removed
     doc = Document("Test project", "Test customer", "test_file.xyz")
     self.assertEqual(docfuncs.upload_document_to_db(doc), 3)
     conn = database_connect.get_database_connection()
     cur = conn.cursor()
     sql = '''DELETE FROM documents WHERE id = 3 '''
     cur.execute(sql)
     conn.commit()
     cur.close()
Example #12
def upload_similar_docs_from_web(fragments: List[Fragment], **params):
    fragments_url_snippet_pairs = get_url_snippet_pairs(fragments, params)

    for url, snippet in set(
            itertools.chain.from_iterable(fragments_url_snippet_pairs)):
        doc = Document(id=None,
                       uri=url,
                       snippet=snippet,
                       date_added=datetime.datetime.now(),
                       user_login=params["user_login"])
        Documents.add_document(doc)
Example #13
def function_to_debug():
    doc_a = Document("DocA", [
        Feature("f0", 10),
        Feature("f1", 5),
        Feature("f2", 10),
        Feature("f3", 20),
        Feature("f4", 10),
        Feature("f5", 10),
        Feature("f6", 10)
    ])
    doc_b = Document("DocB", [
        Feature("f0", 10),
        Feature("f3", 5),
        Feature("f5", 8),
        Feature("f6", 1)
    ])

    m = Match(doc_a, doc_b)
    m.elaborated_match_v2()

    sys.exit(0)
Example #14
    def _pass1(self, thread_no, queue):
        while True:
            logging.debug("INDEXER-P1-THREAD-%d: Waiting for next document" %
                          thread_no)
            document_path = queue.get()  # blocks until a document is available

            document = Document.from_file(document_path)
            if document is None:
                logging.warning("INDEXER-P1-THREAD-%d: Document %s contains "
                                "invalid format" % (thread_no, document_path))
                queue.task_done()
                continue
            logging.debug("INDEXER-P1-THREAD-%d: Processing '%s'" %
                          (thread_no, document.title))

            # Assign an ID to the document
            with self._ifile_lock:
                document.document_id = int(document.document_number)
                self._corpus[document.document_id] = document

                # For similar context index
                if self._full_text is not None:
                    cleaned_text = []
                    for word in document.text.split(" "):
                        if not word.lower() in constants.DO_NOT_INDEX:
                            cleaned_text.append(word)
                    self._full_text = self._full_text + " " + " ".join(
                        cleaned_text)

            # Tokenize
            tokens = re.compile(constants.DELIMITERS).split(document.text)
            tokens_title = re.compile(constants.DELIMITERS).split(
                document.title)
            tokens_author = re.compile(constants.DELIMITERS).split(
                document.author)
            tokens_biblio = re.compile(constants.DELIMITERS).split(
                document.biblio)

            # Insert tokens in inverted files
            for token in tokens:
                self._pass1_process_token(document.document_id, token)

            for token in tokens_title:
                self._pass1_process_token(document.document_id, token)

            for token in tokens_author:
                self._pass1_process_token(document.document_id, token)

            for token in tokens_biblio:
                self._pass1_process_token(document.document_id, token)

            queue.task_done()
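constants.DELIMITERS is not shown; a plausible sketch with a hypothetical pattern (the real value may differ):

import re

# Hypothetical delimiter pattern: runs of whitespace and punctuation.
DELIMITERS = r"[\s.,;:!?()\[\]{}-]+"
tokens = re.compile(DELIMITERS).split("A sample title, with punctuation!")
print(tokens)  # ['A', 'sample', 'title', 'with', 'punctuation', '']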
Example #16
def save_file(project, customer, file, long_file_name):
    """ Processes the server-side functions of the file upload request.

    Args:
        project, customer, file: fields of the Document object to be added
        to the database and the Whoosh index
        long_file_name: OS path to the file to be stored on the server,
        extracted, and added to the search index.

    Returns:

    """

    document = Document(project, customer, file)

    document.document_id = upload_document_to_db(document)
    save_path = DOCUMENT_FILEPATH
    filename = os.path.join(save_path, str(document.document_id))

    # copy the uploaded file into the server's document store
    with open(long_file_name, "rb") as source, open(filename, "wb") as target:
        target.write(source.read())

    upload_to_index(document, long_file_name)
Example #17
    docs = set()
    dictStatDoc[q] = {}
    dictStatFeat[q] = {}

    results = collection.find({'query': q}, {'_id': 0, 'docs': 1})
    count = 0
    for i in results:
        for d in i['docs']:
            count += 1
            name = d['doc_name']
            list_feat = {}
            for f in d['features']:
                list_feat[f] = Feature(f, d['features'][f])
            list_doc[name] = Document(name, list_feat)

    for p in dictQRels[q]['pertinent']:
        if p in list_doc:
            rankedFeat_p = sorted(list_doc[p].features.values(), key=attrgetter('value'), reverse=True)
            for np in dictQRels[q]['nonpertinent']:
                if np in list_doc:
                    rankedFeat_np = sorted(list_doc[np].features.values(), key=attrgetter('value'), reverse=True)
                    val = compareFeatures(rankedFeat_p, rankedFeat_np, k)
                    if val == 1:
                        nb_p_kdomine += 1

                    nb_match_total += 1

Example #18
    model_path="data/models/sentencepiece/",
    train_text_files=dmgr.builder.list_source_files(
        dmgrs=['TEXT_BOOK', 'TEXT_BOOK_LOW', 'TEXT_WEB']),
    sample_count=30000000,  # Total sentence count in my corpus is about 80M (80,000,000)
    config=config,
)

# Dataset build (using modules)
print("Building dataset")
dmgr.builder.build_all([
    "TEXT_BOOK", "TEXT_BOOK_LOW", "TEXT_WEB", "TEXT_WIKI", "TEXT_NAMUWIKI",
    "TEXT_NEWS_COMMENT"
], config)
dmgr.builder.build_all(["PARALLEL_KO_KO"], config)
dmgr.builder.build_all(["TEXT_BERT"], config)
dmgr.builder.build_all(["NER"], config)

# Module train / load
config = embedder.bert.initialize(model_path="data/models/bert",
                                  train_dataset="TEXT_BERT",
                                  config=config)

config = ner.bert_ner.initialize(model_path="data/models/bert_ner",
                                 train_dataset="NER",
                                 config=config)  # depends on bert module

# Generate report
doc = Document("data/datasets/TEXT_BOOK.json", config=config)
reporter.report_to_file(doc)
Example #19
 def createFolderFromBibfile(self):
     doc = Document()
     doc.readFields("article.bib")
     doc.createDocumentFolder()
Example #20
import directoryparser as dirparser  # avoid shadowing the built-in dir()
import drinks_parser as dp
from document.document import Document
from document.constants import *
from csv import CSV_Writer
drink_directory = "../data/drinks/"
pub_directory = "../pub/"
print_dict = dict()
docs = []

if __name__ == '__main__':
    for drink_file in dirparser.get_drink_files(drink_directory):
        docs.append(dp.get_drink_dict(drink_file))

    # To make headlines country->brewery->drinks->review where
    # country, brewery, and drink are unique:
    # {"country": {"brewery":[{drink_name:doc}]}}
    print_dict = dp.order_drinks(docs)

    main_file = Document(pub_directory, js_header)
    main_file.create_content(print_dict)
    main_file.save_basic_file("provningar_main.org")

    one_file = Document(pub_directory, header)
    one_file.create_content(print_dict)
    one_file.save_basic_file("provningar_en_fil.org")

    csv_writer = CSV_Writer(pub_directory, docs)
    csv_writer.write_lines("csv_file.csv")
Example #21
def main():
    # Handle user options
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "dht:i:l:s:n:r:f:vc:b:g:am:o:p:x:w:y:z:", [
                "debug", "help", "type=", "impact=", "life=", "strategy=",
                "nbFeats=", "rounds=", "featureList=", "verbose",
                'collection=', 'group=', 'accepted', 'model=', 'optim=',
                'process=', "cross=", "boost=", "alpha=", "topX=", "best="
            ])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    # Set the default values

    optim = "order"
    debug = False
    nb_rounds = 10
    nb_groups = 10
    nb_documents = 16
    nbFeats = 0
    group = 1
    type_tournament = "robin"
    collection_name = 'trec_adhoc_lee'
    output_directory = "/osirim/sig/PROJET/PRINCESS/results/princess/"
    impact = 0
    boost = "undifferentiated"  # or upper or seed
    alpha = 3  # only used when boost is "upper" or "seed"
    topx = 20  # only used when boost is "seed"

    features_to_remove = []
    strategy = 1
    process = 100
    life = 0
    best = 0.1
    accepted = False
    verbose = False
    model = "f45"
    fold = -1
    step = "training"  # ou "test"
    queriesToProcess = []

    strategy = [
        'f48', 'f17', 'f19', 'f16', 'f46', 'f9', 'f21', 'f3', 'f39', 'f7',
        'f40', 'f6', 'f37', 'f42', 'f2', 'f15', 'f25', 'f33', 'f36', 'f10',
        'f30', 'f51', 'f28', 'f43', 'f45', 'f34', 'f24', 'f13', 'f50', 'f27',
        'f31', 'f1', 'f35', 'f14', 'f47', 'f41', 'f4', 'f22', 'f12', 'f8',
        'f26', 'f44'
    ]

    for o, a in opts:
        if o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-a", "--accepted"):
            accepted = True
        elif o in ("-o", "--optim"):
            optim = a
        elif o in ("-d", "--debug"):
            debug = True
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-t", "--type"):
            type_tournament = a
        elif o in ("-g", "--group"):
            group = int(a)
        elif o in ("-f", "--featureList"):
            features_to_remove = str(a).split(",")
            # print features_to_remove
        elif o in ("-r", "--rounds"):
            nb_rounds = int(a)
        elif o in ("-m", "--model"):
            model = str(a)
        elif o in ("-p", "--process"):
            process = int(a)
        elif o in ("-b", "--best"):
            best = float(a)
        elif o in ("-c", "--collection"):
            collection_name = a
        elif o in ("-i", "--impact"):
            impact = int(a)
        elif o in ("-l", "--life"):
            life = int(a)
        elif o in ("-n", "--nbFeats"):
            nbFeats = int(a)
        elif o in ("-s", "--strategy"):
            strategy = int(a)
        elif o in ("-w", "--boost"):
            boost = str(a)
        elif o in ("-y", "--alpha"):
            alpha = int(a)
        elif o in ("-z", "--topX"):
            topx = int(a)
        elif o in ("-x", "--cross"):
            fold = a
            if '-' in fold:
                step = "training"
                fold = -int(a)
            else:
                step = "test"

        else:
            assert False, "unhandled option"

    # load appropriate queries for this run
    if "web" in collection_name:
        output_directory += "web2014/" + str(fold) + "/"
        with open(
                "/osirim/sig/PROJET/PRINCESS/queries/web2014/folds/" +
                str(fold) + ".txt", "r") as fq:
            for l in fq:
                queriesToProcess.append(l.strip())
    elif "robust" in collection_name:
        output_directory += "robust2004/" + str(fold) + "/"
        with open(
                "/osirim/sig/PROJET/PRINCESS/queries/robust2004/folds/" +
                str(fold) + ".txt", "r") as fq:
            for l in fq:
                queriesToProcess.append(l.strip())
    else:
        output_directory += collection_name + "/" + str(fold) + "/"
        with open(
                "/osirim/sig/PROJET/PRINCESS/queries/" +
                collection_name.lower() + "/folds/" + str(fold) + ".txt",
                "r") as fq:
            for l in fq:
                queriesToProcess.append(l.strip())

    # One tournament per query
    connection = MongoClient(host='co2-ni01.irit.fr', port=28018)
    db = connection.princess
    collection = db[collection_name.lower()]
    queries = collection.distinct('query')

    if debug:
        evaluateQRels(collection_name)

    outputFolderName = ''

    if step == "training":
        output_directory += "training/"
    else:
        output_directory += "test/"

    outputFolderName = ('t:' + type_tournament + '-o:' + optim +
                        '-r:' + str(nb_rounds) + '-b:' + str(best) +
                        '-c:' + collection_name + '-i:' + str(impact) +
                        '-l:' + str(life) + '-n:' + str(nbFeats) +
                        '-s:' + str(strategy) + '-g:' + str(group))
    if len(features_to_remove) > 0:
        outputFolderName += '-f:' + ','.join(features_to_remove)
        if accepted:
            outputFolderName += '-a'

    outputFolderName += '/'
    output_directory += outputFolderName

    if os.path.exists(output_directory):
        os.system("rm -r " + output_directory)

    secure_mkdir(output_directory)

    # print "output directory", output_directory
    os.system("rm " + output_directory + "*")

    print "Nb process", process

    begin = time.time()

    for q in queries:

        # print("docstoCompete:", docsToCompete)

        processQuery = False
        if step == "training":
            if q in queriesToProcess:
                processQuery = False
            else:
                processQuery = True
        else:
            if q in queriesToProcess:
                processQuery = True
            else:
                processQuery = False

        if processQuery:
            print "Query " + q

            deb = time.time()

            docsToCompete = []
            if "indri" not in collection_name:
                docsToCompete = loadDocsToCompete(collection_name, q)

            dictQRels.setdefault(q, {})
            qstr = str(q)
            list = collection.find({'query': qstr}, {'_id': 0, 'docs': 1})
            count = 0
            list_doc = []
            for i in list:
                # print i
                for d in i['docs']:
                    # print "**********"
                    count += 1
                    name = d['doc_name']
                    if len(docsToCompete) == 0 or (len(docsToCompete) > 0
                                                   and name in docsToCompete):
                        # list_feat = []
                        list_feat = {}
                        for f in d['features']:
                            # print f
                            list_feat[f] = Feature(f, d['features'][f])
                            # if float(d['features'][f]) > 1.0 :
                            #	print f + " = "+ str(d['features'][f])
                        if model not in list_feat:
                            list_feat[model] = Feature(model, 0.0)
                        list_doc.append(Document(name, list_feat))
                        # sys.exit()

            colName = collection_name.lower() + "_std"
            #print colName
            collection_std = db[colName]
            listStd = {}
            res = collection_std.find({'query': str(q)}, {'_id': 0})
            # print colName
            # print q
            #print res[0]
            listStd = res[0]['stds']

            if type_tournament == "robin":
                to = RoundRobin(query=q,
                                impact=impact,
                                health=life,
                                nbFeat=nbFeats,
                                strategy=strategy,
                                nbRound=nb_rounds,
                                featsToRemove=features_to_remove,
                                qrel=dictQRels[q],
                                accepted=accepted,
                                optim=optim,
                                listStd=listStd,
                                process=process)
            elif type_tournament == "return":
                to = RoundRobinReturnMatch(query=q,
                                           impact=0,
                                           health=life,
                                           nbFeat=nbFeats,
                                           strategy=strategy,
                                           nbRound=nb_rounds,
                                           featsToRemove=features_to_remove,
                                           accepted=accepted,
                                           optim=optim,
                                           listStd=listStd)
            elif type_tournament == "swiss":
                to = SwissSystem(query=q,
                                 impact=impact,
                                 health=life,
                                 nbFeat=nbFeats,
                                 strategy=strategy,
                                 nbRound=nb_rounds,
                                 featsToRemove=features_to_remove,
                                 accepted=accepted,
                                 optim=optim,
                                 listStd=listStd,
                                 process=process)
            elif type_tournament == "random":
                to = RandomTournament(query=q,
                                      impact=impact,
                                      health=life,
                                      nbFeat=nbFeats,
                                      strategy=strategy,
                                      nbRound=nb_rounds,
                                      featsToRemove=features_to_remove,
                                      qrel=dictQRels[q],
                                      accepted=accepted,
                                      optim=optim,
                                      listStd=listStd)
            elif type_tournament == "grouprobin":
                to = GroupStage(query=q,
                                impact=impact,
                                health=life,
                                nbFeat=nbFeats,
                                strategy=strategy,
                                nbGroups=group,
                                featsToRemove=features_to_remove,
                                qrel=dictQRels[q],
                                best=best,
                                accepted=accepted,
                                optim=optim,
                                listStd=listStd)
            elif type_tournament == "grouprobinoptim":
                to = GroupStageOptim(query=q,
                                     impact=impact,
                                     health=life,
                                     nbFeat=nbFeats,
                                     strategy=strategy,
                                     nbGroups=group,
                                     featsToRemove=features_to_remove,
                                     qrel=dictQRels[q],
                                     best=best,
                                     accepted=accepted,
                                     model=model,
                                     optim=optim,
                                     listStd=listStd,
                                     process=process)
            elif type_tournament == "groupswiss":
                to = GroupSwiss(query=q,
                                impact=impact,
                                health=life,
                                nbFeat=nbFeats,
                                strategy=strategy,
                                nbGroups=group,
                                nbRound=nb_rounds,
                                featsToRemove=features_to_remove,
                                qrel=dictQRels[q],
                                best=best,
                                accepted=accepted,
                                optim=optim,
                                listStd=listStd,
                                process=process)
            elif type_tournament == "groupswissoptim":
                to = GroupSwissOptim(query=q,
                                     impact=impact,
                                     health=life,
                                     nbFeat=nbFeats,
                                     strategy=strategy,
                                     nbGroups=group,
                                     nbRound=nb_rounds,
                                     featsToRemove=features_to_remove,
                                     qrel=dictQRels[q],
                                     best=best,
                                     accepted=accepted,
                                     model=model,
                                     optim=optim,
                                     listStd=listStd,
                                     process=process)
            elif type_tournament == "seed":
                to = Seed(query=q,
                          impact=impact,
                          health=life,
                          nbFeat=nbFeats,
                          strategy=strategy,
                          nbRound=nb_rounds,
                          featsToRemove=features_to_remove,
                          qrel=dictQRels[q],
                          accepted=accepted,
                          model=model,
                          optim=optim,
                          listStd=listStd)
            elif type_tournament == "upper":
                to = Upper(query=q,
                           impact=impact,
                           health=life,
                           nbFeat=nbFeats,
                           strategy=strategy,
                           nbRound=nb_rounds,
                           featsToRemove=features_to_remove,
                           qrel=dictQRels[q],
                           accepted=accepted,
                           model=model,
                           optim=optim,
                           listStd=listStd)
            print "setCompetitors"
            to.setCompetitors(list_doc)
            #print len(list_doc)
            print "runCompetition"
            to.runCompetition()
            print "printResults"
            to.printResults(output_directory)

            print "Query processing time:", (time.time() - deb), "sec"

    print "[ n=", process, type_tournament, "] total time:", (time.time() -
                                                              begin), "ms"
    with open(output_directory + "completed.txt", "w") as f:
        f.write("completed!!")
Example #22
 def test_create_new_document(self):
     doc = Document("Test", "Customer", "testfile.PDF")
     self.assertEqual(doc.document_id, None)
     self.assertEqual(doc.project, "Test")
     self.assertEqual(doc.customer, "Customer")
     self.assertEqual(doc.file, "testfile.PDF")
Example #23
import os, sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../"))

import pandas as pd
from document.document import Document
from formatter import simple as formatter
from tokenizer import simple as tokenizer

test = {}

doc = Document(formatter=formatter,
               tokenizer=tokenizer,
               file="./data/test.txt")
df = pd.DataFrame([line for line in doc])
df.to_pickle("./data/test_document_simple.gz", compression="gzip")
test["simple"] = df.equals(
    pd.read_pickle("./data/test_document_simple.gz", compression="gzip"))

print(test)
Example #24
 def test_set_document_id(self):
     doc = Document("Test", "Customer", "testfile.PDF")
     doc.set_doc_id(1)
     self.assertEqual(doc.document_id, 1)
     doc.set_doc_id(2)
     self.assertEqual(doc.document_id, 2)